diff --git a/Bank_second_part/detect_process/PP_TSMv2_infer.py b/Bank_second_part/detect_process/PP_TSMv2_infer.py
new file mode 100644
index 0000000..0a2b9de
--- /dev/null
+++ b/Bank_second_part/detect_process/PP_TSMv2_infer.py
@@ -0,0 +1,164 @@
+import os
+import os.path as osp
+from paddlevideo.utils.config import get_config
+from paddle.inference import Config, create_predictor
+from tools.utils import build_inference_helper
+
+class PP_TSMv2_predict(object):
+
+    """Initialize the parameters commonly used by the PP-TSMv2 inference model."""
+
+    def __init__(self,use_gpu=True,ir_optim=True,
+                 disable_glog=False,save_name=None,enable_mkldnn=False,
+                 precision="fp32",gpu_mem=8000,cpu_threads=None):
+
+        self.use_gpu = use_gpu  # whether to run on the GPU
+        self.cpu_threads = cpu_threads  # number of CPU threads
+        self.ir_optim = ir_optim  # whether to enable IR optimization
+        self.disable_glog = disable_glog
+        self.gpu_mem = gpu_mem  # GPU memory size
+        self.enable_mkldnn = enable_mkldnn  # whether to enable MKL-DNN
+        self.precision = precision  # MKL-DNN precision
+        self.save_name = save_name  # name used when saving the exported inference model
+
+
+    def parse_file_paths(self,input_path: str) -> list:
+
+        """
+        Collect the input files for the model.
+        input_path: a video file or a directory containing videos
+        """
+        if osp.isfile(input_path):
+            files = [
+                input_path,
+            ]
+        else:
+            files = os.listdir(input_path)
+            files = [
+                file for file in files
+                if (file.endswith(".avi") or file.endswith(".mp4"))
+            ]
+            files = [osp.join(input_path, file) for file in files]
+        return files
+
+
+    def create_paddle_predictor(self,model_f,pretr_p,cfg):
+        """
+        Create the inference engine.
+        model_f: path to the exported inference model file
+        pretr_p: path to the trained parameter file
+        cfg: parsed model configuration
+
+        """
+        config = Config(model_f,pretr_p)
+        if self.use_gpu:
+            config.enable_use_gpu(self.gpu_mem,0)
+        else:
+            config.disable_gpu()
+            if self.cpu_threads:
+                config.set_cpu_math_library_num_threads(self.cpu_threads)
+            if self.enable_mkldnn:
+                config.set_mkldnn_cache_capacity(10)
+                config.enable_mkldnn()
+                if self.precision == "fp16":
+                    config.enable_mkldnn_bfloat16()
+
+        config.switch_ir_optim(self.ir_optim)
+
+        config.enable_memory_optim()
+        config.switch_use_feed_fetch_ops(False)
+
+        if self.disable_glog:
+            config.disable_glog_info()
+
+        predictor = create_predictor(config)
+
+        return config,predictor
+
+    def create_inference_model(self,config,model_f,params_f):
+        """
+        Create the inference helper and the inference engine.
+        config: path to the model config (YAML) file
+        model_f: path to the exported inference model
+        params_f: path to the inference model parameters
+        """
+        cfg = get_config(config, overrides=None, show=False)
+        InferenceHelper = build_inference_helper(cfg.INFERENCE)
+        _, predictor = self.create_paddle_predictor(model_f, params_f, cfg)
+
+        return InferenceHelper,predictor
+
+
+    def predict(self,input_f,batch_size,predictor,InferenceHelper):
+
+        """
+        Run inference on the input videos and return the predictions.
+        input_f: path to the videos to run inference on
+        batch_size: number of samples consumed per inference step, default = 1
+        predictor: the inference engine
+        InferenceHelper: the pre/post-processing helper
+        """
+        result = {}
+
+        # cfg = get_config(config, overrides=None, show=False)
+        # model_name = cfg.model_name
+        # print(f"Inference model({model_name})...")
+
+        # get input_tensor and output_tensor
+        input_names = predictor.get_input_names()
+        output_names = predictor.get_output_names()
+        input_tensor_list = []
+        output_tensor_list = []
+        for item in input_names:
+            input_tensor_list.append(predictor.get_input_handle(item))
+        for item in output_names:
+            output_tensor_list.append(predictor.get_output_handle(item))
+
+        files = self.parse_file_paths(input_f)  # input_path=input_f
+
+        batch_num = batch_size
+        for st_idx in range(0, len(files), batch_num):
+            ed_idx = min(st_idx + batch_num, len(files))
+
+            # preprocess the batch of input videos
+            batched_inputs = InferenceHelper.preprocess_batch(files[st_idx:ed_idx])
+            for i in range(len(input_tensor_list)):
+                input_tensor_list[i].copy_from_cpu(batched_inputs[i])
+
+            # run the inference engine
+            predictor.run()
+
+            batched_outputs = []
+            for j in range(len(output_tensor_list)):
+                batched_outputs.append(output_tensor_list[j].copy_to_cpu())
+
+            # post-process and collect the inference results
+            res = InferenceHelper.postprocess(batched_outputs,False,True)
+            result["video_id"] = res[0]["video_id"]
+            result["topk_class"] = res[0]["topk_class"].tolist()[0]
+            result["topk_scores"] = res[0]["topk_scores"].tolist()[0]
+            # print(result)
+
+        return result
+
+
+
+# def main():
+#     config = 'D:/download/PaddleVideo1/output/output/pptsm_lcnet_k400_16frames_uniform.yaml'  # path to the config file
+#     input_file = 'C:/Users/Administrator/Pictures/video_seg_re_hand/test01_3.avi'  # path to the videos to run inference on
+#     model_file = 'D:/download/PaddleVideo1/output/output/ppTSMv2.pdmodel'  # path to the inference model
+#     params_file = 'D:/download/PaddleVideo1/output/output/ppTSMv2.pdiparams'
+#     batch_size = 1  # inference batch size
+#     infer, predictor = PP_TSMv2_predict().create_inference_model(config, model_file, params_file)
+#     PP_TSMv2_predict().predict(input_file, batch_size, predictor, infer)  # run inference and get predictions
+
+
+
+# if __name__ == "__main__":
+#     main()
+
+
diff --git a/Bank_second_part/detect_process/analysisPoint.py b/Bank_second_part/detect_process/analysisPoint.py
new file mode 100644
index 0000000..2297fdc
--- /dev/null
+++ b/Bank_second_part/detect_process/analysisPoint.py
@@ -0,0 +1,152 @@
+# Copyright 2020 The MediaPipe Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MediaPipe solution drawing utils."""
+
+import math
+from typing import List, Mapping, Optional, Tuple, Union
+
+import cv2
+import dataclasses
+import matplotlib.pyplot as plt
+import numpy as np
+
+from mediapipe.framework.formats import detection_pb2
+from mediapipe.framework.formats import location_data_pb2
+from mediapipe.framework.formats import landmark_pb2
+
+_PRESENCE_THRESHOLD = 0.5
+_VISIBILITY_THRESHOLD = 0.5
+_BGR_CHANNELS = 3
+
+WHITE_COLOR = (224, 224, 224)
+BLACK_COLOR = (0, 0, 0)
+RED_COLOR = (0, 0, 255)
+GREEN_COLOR = (0, 128, 0)
+BLUE_COLOR = (255, 0, 0)
+
+
+@dataclasses.dataclass
+class DrawingSpec:
+  # Color for drawing the annotation. Default to the white color.
+  color: Tuple[int, int, int] = WHITE_COLOR
+  # Thickness for drawing the annotation. Default to 2 pixels.
+  thickness: int = 2
+  # Circle radius. Default to 2 pixels.
+  circle_radius: int = 2
+
+
+def _normalized_to_pixel_coordinates(
+    normalized_x: float, normalized_y: float, image_width: int,
+    image_height: int) -> Union[None, Tuple[int, int]]:
+  """Converts normalized value pair to pixel coordinates."""
+
+  # Checks if the float value is between 0 and 1.
+  def is_valid_normalized_value(value: float) -> bool:
+    return (value > 0 or math.isclose(0, value)) and (value < 1 or
+                                                      math.isclose(1, value))
+
+  if not (is_valid_normalized_value(normalized_x) and
+          is_valid_normalized_value(normalized_y)):
+    # TODO: Draw coordinates even if it's outside of the image bounds.
+    return None
+  x_px = min(math.floor(normalized_x * image_width), image_width - 1)
+  y_px = min(math.floor(normalized_y * image_height), image_height - 1)
+  return x_px, y_px
+
+
+def draw_landmarks(
+    image: np.ndarray,
+    landmark_list: landmark_pb2.NormalizedLandmarkList,
+    connections: Optional[List[Tuple[int, int]]] = None):
+  """Collects the pixel coordinates of the connected landmarks on the image.
+
+  This is a modified MediaPipe drawing util: instead of drawing the
+  landmarks and connections, it returns the pixel coordinates of the
+  landmarks referenced by the given connections.
+
+  Args:
+    image: A three channel BGR image represented as numpy ndarray.
+    landmark_list: A normalized landmark list proto message to be annotated on
+      the image.
+    connections: A list of landmark index tuples that specifies how landmarks
+      should be connected.
+
+  Returns:
+    A list of (x, y) pixel coordinates of the landmarks that appear as
+    connection end points, or None if no landmark list is given.
+
+  Raises:
+    ValueError: If the input image is not three channel BGR.
+  """
+  if not landmark_list:
+    return
+  if image.shape[2] != _BGR_CHANNELS:
+    raise ValueError('Input image must contain three channel bgr data.')
+  image_rows, image_cols, _ = image.shape
+
+  # map every sufficiently visible landmark index to its pixel coordinates
+  idx_to_coordinates = {}
+  for idx, landmark in enumerate(landmark_list.landmark):
+    # print('landmark:',landmark)
+    if ((landmark.HasField('visibility') and
+         landmark.visibility < _VISIBILITY_THRESHOLD) or
+        (landmark.HasField('presence') and
+         landmark.presence < _PRESENCE_THRESHOLD)):
+      continue
+    landmark_px = _normalized_to_pixel_coordinates(landmark.x, landmark.y,
+                                                   image_cols, image_rows)
+    # print('landmark_px:',landmark_px)
+    if landmark_px:
+      idx_to_coordinates[idx] = landmark_px
+
+
+  if connections:
+    num_landmarks = len(landmark_list.landmark)
+    # print('connections:',connections)
+
+    # Draws the connections if the start and end landmarks are both visible.
+ + start_list = [] + end_list = [] + for connection in connections: + # print(connection) + + start_idx = connection[0] + end_idx = connection[1] + + start_list.append(start_idx) + end_list.append(end_idx) + + + point_list = [] + for point_idx in end_list: + + # if point_idx not in start_list: + + # print(point_idx) + point_list.append(point_idx) + + + point_axis_list = [] + for point in point_list: + + if point in list(idx_to_coordinates.keys()): + point_axis_list.append(idx_to_coordinates[point]) + + + return point_axis_list + \ No newline at end of file diff --git a/Bank_second_part/detect_process/holisticDet.py b/Bank_second_part/detect_process/holisticDet.py new file mode 100644 index 0000000..f282bdf --- /dev/null +++ b/Bank_second_part/detect_process/holisticDet.py @@ -0,0 +1,104 @@ +import cv2 +import mediapipe as mp + +import analysisPoint as mp_drawing +mp_holistic = mp.solutions.holistic +import numpy as np + +class MediapipeProcess: + + def mediapipe_det(image,holistic): + + ''' + 调用模型推理获得检测结果 + ''' + + image.flags.writeable = False + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + results = holistic.process(image) + + return results + + def get_analysis_result(image,results): + + ''' + images: 检测的图片 + results: 图片的检测结果 + 对上述结果进行分析 + ''' + + image.flags.writeable = True + image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + + face_result = mp_drawing.draw_landmarks( + image, + results.face_landmarks, + mp_holistic.FACEMESH_CONTOURS) + + right_hand_result = mp_drawing.draw_landmarks( + image, + results.right_hand_landmarks, + mp_holistic.HAND_CONNECTIONS) + + left_hand_result = mp_drawing.draw_landmarks( + image, + results.left_hand_landmarks, + mp_holistic.HAND_CONNECTIONS) + + face_bbox = MediapipeProcess.point_to_bbox(face_result) + right_hand_bbox = MediapipeProcess.point_to_bbox(right_hand_result) + left_hand_bbox = MediapipeProcess.point_to_bbox(left_hand_result) + + result_dict = {'face_bbox':[face_bbox],'hand_bbox':[right_hand_bbox,left_hand_bbox]} + + + return result_dict + + + + def point_to_bbox(result_list): + + ''' + 根据关键点坐标,获取坐标点的最小外接矩形 + ''' + + result_array = np.array(result_list) + + if result_array.all(): + + rect = cv2.minAreaRect(result_array) # 得到最小外接矩形的(中心(x,y), (宽,高), 旋转角度) + bbox = cv2.boxPoints(rect) # 获取最小外接矩形的4个顶点坐标(ps: cv2.boxPoints(rect) for OpenCV 3.x) + bbox = np.int0(bbox) + bbox=bbox.tolist() + + left_top = [min(bbox, key=lambda p: p[0])[0], min(bbox, key=lambda p: p[1])[1]] + right_bottom = [max(bbox, key=lambda p: p[0])[0], max(bbox, key=lambda p: p[1])[1]] + + bbox_list = left_top + right_bottom + + # print('bbox:',bbox) + # print('bbox_list:',bbox_list) + + + # bbox_list = [] + + # bbox_list.append(bbox[0][0]) + # bbox_list.append(bbox[0][1]) + # bbox_list.append(bbox[2][0]) + # bbox_list.append(bbox[2][1]) + + return bbox_list + + else: + pass + + + + + + + + +# if __name__ == '__main__': +# # media_holistic(video_file='E:/Bank_files/Bank_02/dataset/video_person/after_1/0711-1_199_0.avi', + # video_save_path='E:/Bank_files/Bank_02/videos_mediapipe/test_data/0725_test') \ No newline at end of file diff --git a/Bank_second_part/detect_process/paddlevideo/__init__.py b/Bank_second_part/detect_process/paddlevideo/__init__.py new file mode 100644 index 0000000..8b03acf --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .version import paddlevideo_version diff --git a/Bank_second_part/detect_process/paddlevideo/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..0c62e18 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/__pycache__/version.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/__pycache__/version.cpython-310.pyc new file mode 100644 index 0000000..4a30493 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/__pycache__/version.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/__init__.py b/Bank_second_part/detect_process/paddlevideo/loader/__init__.py new file mode 100644 index 0000000..4ed9b11 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .builder import build_dataset, build_dataloader, build_batch_pipeline +from .dataset import VideoDataset +from .dali_loader import TSN_Dali_loader, get_input_data + +__all__ = [ + 'build_dataset', 'build_dataloader', 'build_batch_pipeline', 'VideoDataset', + 'TSN_Dali_loader', 'get_input_data' +] diff --git a/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..55be032 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/builder.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/builder.cpython-310.pyc new file mode 100644 index 0000000..44939ca Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/builder.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/dali_loader.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/dali_loader.cpython-310.pyc new file mode 100644 index 0000000..b3d04dc Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/dali_loader.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/registry.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/registry.cpython-310.pyc new file mode 100644 index 0000000..ffc2596 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/registry.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/builder.py b/Bank_second_part/detect_process/paddlevideo/loader/builder.py new file mode 100644 index 0000000..23a65c3 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/builder.py @@ -0,0 +1,132 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import signal +import os +import paddle +from paddle.io import DataLoader, DistributedBatchSampler +from .registry import DATASETS, PIPELINES +from ..utils.build_utils import build +from .pipelines.compose import Compose +from paddlevideo.utils import get_logger +from paddlevideo.utils.multigrid import DistributedShortSampler +import numpy as np + +logger = get_logger("paddlevideo") + + +def build_pipeline(cfg): + """Build pipeline. + Args: + cfg (dict): root config dict. + """ + if cfg == None: + return + return Compose(cfg) + + +def build_dataset(cfg): + """Build dataset. + Args: + cfg (dict): root config dict. + + Returns: + dataset: dataset. + """ + #XXX: ugly code here! 
+ cfg_dataset, cfg_pipeline = cfg + cfg_dataset.pipeline = build_pipeline(cfg_pipeline) + dataset = build(cfg_dataset, DATASETS, key="format") + return dataset + + +def build_batch_pipeline(cfg): + + batch_pipeline = build(cfg, PIPELINES) + return batch_pipeline + + +def build_dataloader(dataset, + batch_size, + num_workers, + places, + shuffle=True, + drop_last=True, + multigrid=False, + collate_fn_cfg=None, + **kwargs): + """Build Paddle Dataloader. + + XXX explain how the dataloader work! + + Args: + dataset (paddle.dataset): A PaddlePaddle dataset object. + batch_size (int): batch size on single card. + num_worker (int): num_worker + shuffle(bool): whether to shuffle the data at every epoch. + """ + if multigrid: + sampler = DistributedShortSampler(dataset, + batch_sizes=batch_size, + shuffle=True, + drop_last=True) + else: + sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last) + + #NOTE(shipping): when switch the mix operator on, such as: mixup, cutmix. + # batch like: [[img, label, attibute, ...], [imgs, label, attribute, ...], ...] will recollate to: + # [[img, img, ...], [label, label, ...], [attribute, attribute, ...], ...] as using numpy.transpose. + + def mix_collate_fn(batch): + pipeline = build_batch_pipeline(collate_fn_cfg) + batch = pipeline(batch) + slots = [] + for items in batch: + for i, item in enumerate(items): + if len(slots) < len(items): + slots.append([item]) + else: + slots[i].append(item) + return [np.stack(slot, axis=0) for slot in slots] + + #if collate_fn_cfg is not None: + #ugly code here. collate_fn is mix op config + # collate_fn = mix_collate_fn(collate_fn_cfg) + + data_loader = DataLoader( + dataset, + batch_sampler=sampler, + places=places, + num_workers=num_workers, + collate_fn=mix_collate_fn if collate_fn_cfg is not None else None, + return_list=True, + **kwargs) + + return data_loader + + +def term_mp(sig_num, frame): + """ kill all child processes + """ + pid = os.getpid() + pgid = os.getpgid(os.getpid()) + logger.info("main proc {} exit, kill process group " "{}".format(pid, pgid)) + os.killpg(pgid, signal.SIGKILL) + return + + +signal.signal(signal.SIGINT, term_mp) +signal.signal(signal.SIGTERM, term_mp) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dali_loader.py b/Bank_second_part/detect_process/paddlevideo/loader/dali_loader.py new file mode 100644 index 0000000..4fb0e28 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dali_loader.py @@ -0,0 +1,206 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import random +import math + +import paddle +from paddle.distributed import ParallelEnv +import paddle.distributed as dist +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + +try: + from nvidia.dali.pipeline import Pipeline + import nvidia.dali.ops as ops + import nvidia.dali.types as types + import tempfile + from nvidia.dali.plugin.paddle import DALIGenericIterator +except: + Pipeline = object + + +def get_input_data(data): + return paddle.to_tensor(data[0]['image']), paddle.to_tensor( + data[0]['label']) + + +class TSN_Dali_loader(object): + def __init__(self, cfg): + self.batch_size = cfg.batch_size + self.file_path = cfg.file_path + + self.num_seg = cfg.num_seg + self.seglen = cfg.seglen + self.short_size = cfg.short_size + self.target_size = cfg.target_size + + # set num_shards and shard_id when distributed training is implemented + self.num_shards = dist.get_world_size() + self.shard_id = ParallelEnv().local_rank + self.dali_mean = cfg.mean * (self.num_seg * self.seglen) + self.dali_std = cfg.std * (self.num_seg * self.seglen) + + def build_dali_reader(self): + """ + build dali training reader + """ + def reader_(): + with open(self.file_path) as flist: + full_lines = [line for line in flist] + if (not hasattr(reader_, 'seed')): + reader_.seed = 0 + random.Random(reader_.seed).shuffle(full_lines) + logger.info(f"reader shuffle seed: {reader_.seed}.") + if reader_.seed is not None: + reader_.seed += 1 + + per_node_lines = int( + math.ceil(len(full_lines) * 1.0 / self.num_shards)) + total_lines = per_node_lines * self.num_shards + + # aligned full_lines so that it can evenly divisible + full_lines += full_lines[:(total_lines - len(full_lines))] + assert len(full_lines) == total_lines + + # trainer get own sample + lines = full_lines[self.shard_id:total_lines:self.num_shards] + assert len(lines) == per_node_lines + + logger.info( + f"shard_id: {self.shard_id}, trainer_count: {self.num_shards}" + ) + logger.info( + f"read videos from {self.shard_id * per_node_lines}, " + f"length: {per_node_lines}, " + f"lines length: {len(lines)}, " + f"total: {len(full_lines)}") + + video_files = ''.join([item for item in lines]) + tf = tempfile.NamedTemporaryFile() + tf.write(str.encode(video_files)) + tf.flush() + video_files = tf.name + + device_id = ParallelEnv().local_rank + logger.info(f'---------- device_id: {device_id} -----------') + + pipe = VideoPipe(batch_size=self.batch_size, + num_threads=1, + device_id=device_id, + file_list=video_files, + sequence_length=self.num_seg * self.seglen, + num_seg=self.num_seg, + seg_length=self.seglen, + resize_shorter_scale=self.short_size, + crop_target_size=self.target_size, + is_training=True, + num_shards=self.num_shards, + shard_id=self.shard_id, + dali_mean=self.dali_mean, + dali_std=self.dali_std) + + logger.info( + 'initializing dataset, it will take several minutes if it is too large .... 
' + ) + video_loader = DALIGenericIterator([pipe], ['image', 'label'], + len(lines), + dynamic_shape=True, + auto_reset=True) + + return video_loader + + dali_reader = reader_() + return dali_reader + + +class VideoPipe(Pipeline): + def __init__(self, + batch_size, + num_threads, + device_id, + file_list, + sequence_length, + num_seg, + seg_length, + resize_shorter_scale, + crop_target_size, + is_training=False, + initial_prefetch_size=20, + num_shards=1, + shard_id=0, + dali_mean=0., + dali_std=1.0): + super(VideoPipe, self).__init__(batch_size, num_threads, device_id) + self.input = ops.VideoReader(device="gpu", + file_list=file_list, + sequence_length=sequence_length, + num_seg=num_seg, + seg_length=seg_length, + is_training=is_training, + num_shards=num_shards, + shard_id=shard_id, + random_shuffle=is_training, + initial_fill=initial_prefetch_size) + # the sequece data read by ops.VideoReader is of shape [F, H, W, C] + # Because the ops.Resize does not support sequence data, + # it will be transposed into [H, W, F, C], + # then reshaped to [H, W, FC], and then resized like a 2-D image. + self.transpose = ops.Transpose(device="gpu", perm=[1, 2, 0, 3]) + self.reshape = ops.Reshape(device="gpu", + rel_shape=[1.0, 1.0, -1], + layout='HWC') + self.resize = ops.Resize(device="gpu", + resize_shorter=resize_shorter_scale) + # crops and mirror are applied by ops.CropMirrorNormalize. + # Normalization will be implemented in paddle due to the difficulty of dimension broadcast, + # It is not sure whether dimension broadcast can be implemented correctly by dali, just take the Paddle Op instead. + self.pos_rng_x = ops.Uniform(range=(0.0, 1.0)) + self.pos_rng_y = ops.Uniform(range=(0.0, 1.0)) + self.mirror_generator = ops.Uniform(range=(0.0, 1.0)) + self.cast_mirror = ops.Cast(dtype=types.DALIDataType.INT32) + self.crop_mirror_norm = ops.CropMirrorNormalize( + device="gpu", + crop=[crop_target_size, crop_target_size], + mean=dali_mean, + std=dali_std) + self.reshape_back = ops.Reshape( + device="gpu", + shape=[num_seg, seg_length * 3, crop_target_size, crop_target_size], + layout='FCHW') + self.cast_label = ops.Cast(device="gpu", dtype=types.DALIDataType.INT64) + + def define_graph(self): + output, label = self.input(name="Reader") + output = self.transpose(output) + output = self.reshape(output) + + output = self.resize(output) + output = output / 255. + pos_x = self.pos_rng_x() + pos_y = self.pos_rng_y() + mirror_flag = self.mirror_generator() + mirror_flag = (mirror_flag > 0.5) + mirror_flag = self.cast_mirror(mirror_flag) + output = self.crop_mirror_norm(output, + crop_pos_x=pos_x, + crop_pos_y=pos_y, + mirror=mirror_flag) + output = self.reshape_back(output) + label = self.cast_label(label) + return output, label + + def __len__(self): + return self.epoch_size() diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/MRI.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/MRI.py new file mode 100644 index 0000000..990cb87 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/MRI.py @@ -0,0 +1,109 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class MRIDataset(BaseDataset): + """Rawframe dataset for action recognition. + The dataset loads raw frames from frame files, and apply specified transform operatation them. + The indecx file is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace. + Example of an index file: + + .. code-block:: txt + + file_path-1 150 1 + file_path-2 160 1 + file_path-3 170 2 + file_path-4 180 2 + + Args: + file_path (str): Path to the index file. + pipeline(XXX): + data_prefix (str): directory path of the data. Default: None. + test_mode (bool): Whether to bulid the test dataset. Default: False. + suffix (str): suffix of file. Default: 'img_{:05}.jpg'. + + """ + def __init__(self, + file_path, + pipeline, + num_retries=5, + data_prefix=None, + test_mode=False, + suffix='img_{:05}.jpg'): + self.num_retries = num_retries + self.suffix = suffix + super().__init__(file_path, pipeline, data_prefix, test_mode) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split() + frame_dir, frames_len, labels = line_split + if self.data_prefix is not None: + frame_dir = osp.join(self.data_prefix, frame_dir) + info.append( + dict( + frame_dir=frame_dir, + #suffix=self.suffix, + frames_len=frames_len, + labels=int(labels))) + return info + + def prepare_train(self, idx): + """Prepare the frames for training/valid gisven index. """ + #Try to catch Exception caused by reading missing frames files + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['frame_dir'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return np.array(results['imgs']), np.array([results['labels']]) + + def prepare_test(self, idx): + """Prepare the frames for test given index. """ + #Try to catch Exception caused by reading missing frames files + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". 
+ format(results['frame_dir'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return np.array(results['imgs']), np.array([results['labels']]) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/MRI_SlowFast.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/MRI_SlowFast.py new file mode 100644 index 0000000..db905e4 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/MRI_SlowFast.py @@ -0,0 +1,111 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class SFMRIDataset(BaseDataset): + """Rawframe dataset for action recognition. + The dataset loads raw frames from frame files, and apply specified transform operatation them. + The indecx file is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace. + Example of an index file: + + .. code-block:: txt + + file_path-1 150 1 + file_path-2 160 1 + file_path-3 170 2 + file_path-4 180 2 + + Args: + file_path (str): Path to the index file. + pipeline(XXX): + data_prefix (str): directory path of the data. Default: None. + test_mode (bool): Whether to bulid the test dataset. Default: False. + suffix (str): suffix of file. Default: 'img_{:05}.jpg'. + + """ + def __init__(self, + file_path, + pipeline, + num_retries=5, + data_prefix=None, + test_mode=False, + suffix='img_{:05}.jpg'): + self.num_retries = num_retries + self.suffix = suffix + super().__init__(file_path, pipeline, data_prefix, test_mode) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split() + frame_dir, frames_len, labels = line_split + if self.data_prefix is not None: + frame_dir = osp.join(self.data_prefix, frame_dir) + info.append( + dict( + frame_dir=frame_dir, + #suffix=self.suffix, + frames_len=frames_len, + labels=int(labels))) + return info + + def prepare_train(self, idx): + """Prepare the frames for training/valid gisven index. """ + #Try to catch Exception caused by reading missing frames files + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['frame_dir'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return np.array(results['imgs'][0]), np.array( + results['imgs'][1]), np.array([results['labels']]) + + def prepare_test(self, idx): + """Prepare the frames for test given index. 
""" + #Try to catch Exception caused by reading missing frames files + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['frame_dir'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return np.array(results['imgs'][0]), np.array( + results['imgs'][1]), np.array([results['labels']]) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__init__.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__init__.py new file mode 100644 index 0000000..e974191 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__init__.py @@ -0,0 +1,41 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .actbert_dataset import ActBertDataset +from .ava_dataset import AVADataset +from .bmn_dataset import BMNDataset +from .davis_dataset import DavisDataset +from .feature import FeatureDataset +from .frame import FrameDataset, FrameDataset_Sport +from .MRI import MRIDataset +from .MRI_SlowFast import SFMRIDataset +from .msrvtt import MSRVTTDataset +from .actbert_dataset import ActBertDataset +from .asrf_dataset import ASRFDataset +from .ms_tcn_dataset import MSTCNDataset +from .oxford import MonoDataset +from .skeleton import SkeletonDataset +from .slowfast_video import SFVideoDataset +from .video import VideoDataset +from .ucf101_skeleton import UCF101SkeletonDataset +from .ucf24_dataset import UCF24Dataset + + +__all__ = [ + 'VideoDataset', 'FrameDataset', 'SFVideoDataset', 'BMNDataset', + 'FeatureDataset', 'SkeletonDataset', 'AVADataset', 'MonoDataset', + 'MSRVTTDataset', 'ActBertDataset', 'DavisDataset', 'MRIDataset', + 'SFMRIDataset', 'FrameDataset_Sport', 'MSTCNDataset', 'ASRFDataset', + 'UCF101SkeletonDataset', 'UCF24Dataset' +] diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/MRI.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/MRI.cpython-310.pyc new file mode 100644 index 0000000..58b431f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/MRI.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/MRI_SlowFast.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/MRI_SlowFast.cpython-310.pyc new file mode 100644 index 0000000..08be8bc Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/MRI_SlowFast.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..8da8e88 Binary files /dev/null 
and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/actbert_dataset.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/actbert_dataset.cpython-310.pyc new file mode 100644 index 0000000..444ce36 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/actbert_dataset.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/asrf_dataset.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/asrf_dataset.cpython-310.pyc new file mode 100644 index 0000000..6dd42b0 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/asrf_dataset.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ava_dataset.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ava_dataset.cpython-310.pyc new file mode 100644 index 0000000..03146ed Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ava_dataset.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..5264812 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/bmn_dataset.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/bmn_dataset.cpython-310.pyc new file mode 100644 index 0000000..725e9cb Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/bmn_dataset.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/davis_dataset.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/davis_dataset.cpython-310.pyc new file mode 100644 index 0000000..2069db9 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/davis_dataset.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/feature.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/feature.cpython-310.pyc new file mode 100644 index 0000000..5e41573 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/feature.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/frame.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/frame.cpython-310.pyc new file mode 100644 index 0000000..a3b9379 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/frame.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ms_tcn_dataset.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ms_tcn_dataset.cpython-310.pyc new file mode 100644 index 0000000..470c45f Binary files /dev/null and 
b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ms_tcn_dataset.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/msrvtt.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/msrvtt.cpython-310.pyc new file mode 100644 index 0000000..6bbe257 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/msrvtt.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/oxford.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/oxford.cpython-310.pyc new file mode 100644 index 0000000..16d0a42 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/oxford.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/skeleton.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/skeleton.cpython-310.pyc new file mode 100644 index 0000000..9a297c2 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/skeleton.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/slowfast_video.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/slowfast_video.cpython-310.pyc new file mode 100644 index 0000000..32e6237 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/slowfast_video.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ucf101_skeleton.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ucf101_skeleton.cpython-310.pyc new file mode 100644 index 0000000..2ff935a Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ucf101_skeleton.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ucf24_dataset.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ucf24_dataset.cpython-310.pyc new file mode 100644 index 0000000..9840716 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ucf24_dataset.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/video.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/video.cpython-310.pyc new file mode 100644 index 0000000..918e27f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/video.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/actbert_dataset.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/actbert_dataset.py new file mode 100644 index 0000000..8cccf5c --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/actbert_dataset.py @@ -0,0 +1,74 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np +try: + import lmdb +except ImportError as e: + print( + f"Warning! {e}, [lmdb] package and it's dependencies is required for ActBERT." + ) +import pickle +import json +try: + from paddlenlp.transformers import BertTokenizer +except ImportError as e: + print( + f"Warning! {e}, [paddlenlp] package and it's dependencies is required for ActBERT." + ) +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class ActBertDataset(BaseDataset): + """ActBert dataset. + """ + def __init__( + self, + file_path, + pipeline, + bert_model="bert-base-uncased", + data_prefix=None, + test_mode=False, + ): + self.bert_model = bert_model + super().__init__(file_path, pipeline, data_prefix, test_mode) + + def load_file(self): + """Load index file to get video information.""" + feature_data = np.load(self.file_path, allow_pickle=True) + self.tokenizer = BertTokenizer.from_pretrained(self.bert_model, + do_lower_case=True) + self.info = [] + for item in feature_data: + self.info.append(dict(feature=item, tokenizer=self.tokenizer)) + return self.info + + def prepare_train(self, idx): + """Prepare the frames for training/valid given index. """ + results = copy.deepcopy(self.info[idx]) + #print('==results==', results) + results = self.pipeline(results) + return results['features'] + + def prepare_test(self, idx): + """Prepare the frames for test given index. """ + pass diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/asrf_dataset.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/asrf_dataset.py new file mode 100644 index 0000000..15bd35a --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/asrf_dataset.py @@ -0,0 +1,104 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy + +import os +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class ASRFDataset(BaseDataset): + """Video dataset for action segmentation. 
+ """ + + def __init__( + self, + file_path, + pipeline, + feature_path, + label_path, + boundary_path, + **kwargs, + ): + super().__init__(file_path, pipeline, **kwargs) + self.label_path = label_path + self.boundary_path = boundary_path + self.feature_path = feature_path + + def load_file(self): + """Load index file to get video information.""" + file_ptr = open(self.file_path, 'r') + info = file_ptr.read().split('\n')[:-1] + file_ptr.close() + return info + + def prepare_train(self, idx): + """TRAIN & VALID: Prepare data for training/valid given the index.""" + results = {} + video_name = self.info[idx] + # load video feature + file_name = video_name.split('.')[0] + ".npy" + feat_file_path = os.path.join(self.feature_path, file_name) + #TODO: check path + video_feat = np.load(feat_file_path) + + # load label + file_name = video_name.split('.')[0] + ".npy" + label_file_path = os.path.join(self.label_path, file_name) + label = np.load(label_file_path).astype(np.int64) + + # load boundary + file_name = video_name.split('.')[0] + ".npy" + boundary_file_path = os.path.join(self.boundary_path, file_name) + boundary = np.expand_dims(np.load(boundary_file_path),axis=0).astype(np.float32) + + results['video_feat'] = copy.deepcopy(video_feat) + results['video_label'] = copy.deepcopy(label) + results['video_boundary'] = copy.deepcopy(boundary) + + results = self.pipeline(results) + return results['video_feat'], results['video_label'], results['video_boundary'] + + def prepare_test(self, idx): + """TEST: Prepare the data for test given the index.""" + results = {} + video_name = self.info[idx] + # load video feature + file_name = video_name.split('.')[0] + ".npy" + feat_file_path = os.path.join(self.feature_path, file_name) + #TODO: check path + video_feat = np.load(feat_file_path) + + # load label + file_name = video_name.split('.')[0] + ".npy" + label_file_path = os.path.join(self.label_path, file_name) + label = np.load(label_file_path).astype(np.int64) + + # load boundary + file_name = video_name.split('.')[0] + ".npy" + boundary_file_path = os.path.join(self.boundary_path, file_name) + boundary = np.expand_dims(np.load(boundary_file_path),axis=0).astype(np.float32) + + results['video_feat'] = copy.deepcopy(video_feat) + results['video_label'] = copy.deepcopy(label) + results['video_boundary'] = copy.deepcopy(boundary) + + results = self.pipeline(results) + return results['video_feat'], results['video_label'], results['video_boundary'] diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/ava_dataset.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/ava_dataset.py new file mode 100644 index 0000000..744e15b --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/ava_dataset.py @@ -0,0 +1,249 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os.path as osp +import copy +import random +import numpy as np +import sys +import os +import pickle +from datetime import datetime +from ...metrics.ava_utils import ava_evaluate_results +from ..registry import DATASETS +from .base import BaseDataset +from collections import defaultdict + + +@DATASETS.register() +class AVADataset(BaseDataset): + """AVA dataset for spatial temporal detection. + the dataset loads raw frames, bounding boxes, proposals and applies + transformations to return the frame tensors and other information. + """ + + _FPS = 30 + + def __init__(self, + pipeline, + file_path=None, + exclude_file=None, + label_file=None, + suffix='{:05}.jpg', + proposal_file=None, + person_det_score_thr=0.9, + num_classes=81, + data_prefix=None, + test_mode=False, + num_max_proposals=1000, + timestamp_start=900, + timestamp_end=1800): + self.custom_classes = None + self.exclude_file = exclude_file + self.label_file = label_file + self.proposal_file = proposal_file + assert 0 <= person_det_score_thr <= 1, ( + 'The value of ' + 'person_det_score_thr should in [0, 1]. ') + self.person_det_score_thr = person_det_score_thr + self.num_classes = num_classes + self.suffix = suffix + self.num_max_proposals = num_max_proposals + self.timestamp_start = timestamp_start + self.timestamp_end = timestamp_end + super().__init__( + file_path, + pipeline, + data_prefix, + test_mode, + ) + if self.proposal_file is not None: + self.proposals = self._load(self.proposal_file) + else: + self.proposals = None + if not test_mode: + valid_indexes = self.filter_exclude_file() + self.info = self.info = [self.info[i] for i in valid_indexes] + + def _load(self, path): + f = open(path, 'rb') + res = pickle.load(f) + f.close() + return res + + def parse_img_record(self, img_records): + bboxes, labels, entity_ids = [], [], [] + while len(img_records) > 0: + img_record = img_records[0] + num_img_records = len(img_records) + selected_records = list( + filter( + lambda x: np.array_equal(x['entity_box'], img_record[ + 'entity_box']), img_records)) + num_selected_records = len(selected_records) + img_records = list( + filter( + lambda x: not np.array_equal(x['entity_box'], img_record[ + 'entity_box']), img_records)) + assert len(img_records) + num_selected_records == num_img_records + + bboxes.append(img_record['entity_box']) + valid_labels = np.array([ + selected_record['label'] for selected_record in selected_records + ]) + + label = np.zeros(self.num_classes, dtype=np.float32) + label[valid_labels] = 1. 
+ + labels.append(label) + entity_ids.append(img_record['entity_id']) + + bboxes = np.stack(bboxes) + labels = np.stack(labels) + entity_ids = np.stack(entity_ids) + return bboxes, labels, entity_ids + + def filter_exclude_file(self): + valid_indexes = [] + if self.exclude_file is None: + valid_indexes = list(range(len(self.info))) + else: + exclude_video_infos = [ + x.strip().split(',') for x in open(self.exclude_file) + ] + for i, video_info in enumerate(self.info): + valid_indexes.append(i) + for video_id, timestamp in exclude_video_infos: + if (video_info['video_id'] == video_id + and video_info['timestamp'] == int(timestamp)): + valid_indexes.pop() + break + return valid_indexes + + def load_file(self): + """Load index file to get video information.""" + info = [] + records_dict_by_img = defaultdict(list) + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split(',') + + video_id = line_split[0] + timestamp = int(line_split[1]) + img_key = f'{video_id},{timestamp:04d}' + + entity_box = np.array(list(map(float, line_split[2:6]))) + label = int(line_split[6]) + entity_id = int(line_split[7]) + shot_info = (0, (self.timestamp_end - self.timestamp_start) * + self._FPS) + + video_info = dict(video_id=video_id, + timestamp=timestamp, + entity_box=entity_box, + label=label, + entity_id=entity_id, + shot_info=shot_info) + records_dict_by_img[img_key].append(video_info) + + for img_key in records_dict_by_img: + video_id, timestamp = img_key.split(',') + bboxes, labels, entity_ids = self.parse_img_record( + records_dict_by_img[img_key]) + ann = dict(gt_bboxes=bboxes, + gt_labels=labels, + entity_ids=entity_ids) + frame_dir = video_id + if self.data_prefix is not None: + frame_dir = osp.join(self.data_prefix, frame_dir) + video_info = dict(frame_dir=frame_dir, + video_id=video_id, + timestamp=int(timestamp), + img_key=img_key, + shot_info=shot_info, + fps=self._FPS, + ann=ann) + info.append(video_info) + + return info + + def prepare_train(self, idx): + results = copy.deepcopy(self.info[idx]) + img_key = results['img_key'] + + results['suffix'] = self.suffix + results['timestamp_start'] = self.timestamp_start + results['timestamp_end'] = self.timestamp_end + + if self.proposals is not None: + if img_key not in self.proposals: + results['proposals'] = np.array([[0, 0, 1, 1]]) + results['scores'] = np.array([1]) + else: + proposals = self.proposals[img_key] + assert proposals.shape[-1] in [4, 5] + if proposals.shape[-1] == 5: + thr = min(self.person_det_score_thr, max(proposals[:, 4])) + positive_inds = (proposals[:, 4] >= thr) + proposals = proposals[positive_inds] + proposals = proposals[:self.num_max_proposals] + results['proposals'] = proposals[:, :4] + results['scores'] = proposals[:, 4] + else: + proposals = proposals[:self.num_max_proposals] + results['proposals'] = proposals + + ann = results.pop('ann') + results['gt_bboxes'] = ann['gt_bboxes'] + results['gt_labels'] = ann['gt_labels'] + results['entity_ids'] = ann['entity_ids'] + + #ret = self.pipeline(results, "") + ret = self.pipeline(results) + #padding for dataloader + len_proposals = ret['proposals'].shape[0] + len_gt_bboxes = ret['gt_bboxes'].shape[0] + len_gt_labels = ret['gt_labels'].shape[0] + len_scores = ret['scores'].shape[0] + len_entity_ids = ret['entity_ids'].shape[0] + padding_len = 128 + ret['proposals'] = self.my_padding_2d(ret['proposals'], padding_len) + ret['gt_bboxes'] = self.my_padding_2d(ret['gt_bboxes'], padding_len) + ret['gt_labels'] = self.my_padding_2d(ret['gt_labels'], 
padding_len) + ret['scores'] = self.my_padding_1d(ret['scores'], padding_len) + ret['entity_ids'] = self.my_padding_1d(ret['entity_ids'], padding_len) + return ret['imgs'][0], ret['imgs'][1], ret['proposals'], ret[ + 'gt_bboxes'], ret['gt_labels'], ret['scores'], ret[ + 'entity_ids'], np.array( + ret['img_shape'], dtype=int + ), idx, len_proposals, len_gt_bboxes, len_gt_labels, len_scores, len_entity_ids + + def my_padding_2d(self, feat, max_len): + feat_add = np.zeros((max_len - feat.shape[0], feat.shape[1]), + dtype=np.float32) + feat_pad = np.concatenate((feat, feat_add), axis=0) + return feat_pad + + def my_padding_1d(self, feat, max_len): + feat_add = np.zeros((max_len - feat.shape[0]), dtype=np.float32) + feat_pad = np.concatenate((feat, feat_add), axis=0) + return feat_pad + + def prepare_test(self, idx): + return self.prepare_train(idx) + + def evaluate(self, results): + return ava_evaluate_results(self.info, len(self), results, + self.custom_classes, self.label_file, + self.file_path, self.exclude_file) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/base.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/base.py new file mode 100644 index 0000000..2549dc4 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/base.py @@ -0,0 +1,80 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import numpy as np +from abc import ABC, abstractmethod + +import paddle +from paddle.io import Dataset + + +class BaseDataset(Dataset, ABC): + """Base class for datasets + + All datasets should subclass it. + All subclass should overwrite: + + - Method: `load_file`, load info from index file. + - Method: `prepare_train`, providing train data. + - Method: `prepare_test`, providing test data. + + Args: + file_path (str): index file path. + pipeline (Sequence XXX) + data_prefix (str): directory path of the data. Default: None. + test_mode (bool): whether to build test dataset. Default: False. + + """ + def __init__(self, file_path, pipeline, data_prefix=None, test_mode=False): + super().__init__() + self.file_path = file_path + self.data_prefix = osp.realpath(data_prefix) if \ + data_prefix is not None and osp.isdir(data_prefix) else data_prefix + self.test_mode = test_mode + self.pipeline = pipeline + self.info = self.load_file() + + @abstractmethod + def load_file(self): + """load the video information from the index file path.""" + pass + + def prepare_train(self, idx): + """TRAIN & VALID. 
Prepare the data for training/valid given the index.""" + #Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + #unsqueeze label to list + return results['imgs'], np.array([results['labels']]) + + def prepare_test(self, idx): + """TEST: Prepare the data for test given the index.""" + #Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + #unsqueeze label to list + return results['imgs'], np.array([results['labels']]) + + def __len__(self): + """get the size of the dataset.""" + return len(self.info) + + def __getitem__(self, idx): + """ Get the sample for either training or testing given index""" + if self.test_mode: + return self.prepare_test(idx) + else: + return self.prepare_train(idx) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/bmn_dataset.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/bmn_dataset.py new file mode 100644 index 0000000..44c7651 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/bmn_dataset.py @@ -0,0 +1,72 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import json + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class BMNDataset(BaseDataset): + """Video dataset for action localization. 
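+
+    The index file is an annotation json (typically ActivityNet-style); each
+    video entry carries a "subset" field that is matched against `subset`.
+
+    Args:
+        file_path (str): path to the annotation json.
+        pipeline (obj): data pipeline applied to each sample.
+        subset (str): subset to keep, e.g. "train" or "validation".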
+ """ + def __init__( + self, + file_path, + pipeline, + subset, + **kwargs, + ): + self.subset = subset + super().__init__(file_path, pipeline, **kwargs) + + def load_file(self): + """Load index file to get video information.""" + info = [] + annos = json.load(open(self.file_path)) + for video_name in annos.keys(): + video_subset = annos[video_name]["subset"] + if self.subset in video_subset: + info.append( + dict( + video_name=video_name, + video_info=annos[video_name], + )) + #sort by video_name + sort_f = lambda elem: elem['video_name'] + info.sort(key=sort_f) + #add video_idx to info + for idx, elem in enumerate(info): + info[idx]['video_idx'] = idx + logger.info("{} subset video numbers: {}".format( + self.subset, len(info))) + return info + + def prepare_train(self, idx): + """TRAIN & VALID: Prepare data for training/valid given the index.""" + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + return results['video_feat'], results['gt_iou_map'], results['gt_start'],\ + results['gt_end'] + + def prepare_test(self, idx): + """TEST: Prepare the data for test given the index.""" + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + return results['video_feat'], results['gt_iou_map'], results['gt_start'], \ + results['gt_end'], results['video_idx'] diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/davis_dataset.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/davis_dataset.py new file mode 100644 index 0000000..20a2759 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/davis_dataset.py @@ -0,0 +1,189 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import os.path as osp +import copy +import random +import numpy as np +import shutil +from PIL import Image +import cv2 +from paddle.io import Dataset + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +class VOS_Test(Dataset): + """process frames in each video + """ + def __init__(self, + image_root, + label_root, + seq_name, + images, + labels, + pipeline=None, + rgb=False, + resolution=None): + self.image_root = image_root + self.label_root = label_root + self.seq_name = seq_name + self.images = images # image file list + self.labels = labels + self.obj_num = 1 + self.num_frame = len(self.images) + self.pipeline = pipeline + self.rgb = rgb + self.resolution = resolution + + self.obj_nums = [] + temp_obj_num = 0 + for img_name in self.images: + self.obj_nums.append(temp_obj_num) + current_label_name = img_name.split('.')[0] + '.png' + if current_label_name in self.labels: + current_label = self.read_label(current_label_name) + if temp_obj_num < np.unique( + current_label)[-1]: #get object number from label_id + temp_obj_num = np.unique(current_label)[-1] + + def __len__(self): + return len(self.images) + + def read_image(self, idx): + img_name = self.images[idx] + img_path = os.path.join(self.image_root, self.seq_name, img_name) + img = cv2.imread(img_path) + img = np.array(img, dtype=np.float32) + if self.rgb: + img = img[:, :, [2, 1, 0]] + return img + + def read_label(self, label_name): + label_path = os.path.join(self.label_root, self.seq_name, label_name) + label = Image.open(label_path) + label = np.array(label, dtype=np.uint8) + return label + + def __getitem__(self, idx): + img_name = self.images[idx] + current_img = self.read_image(idx) + current_img = np.array(current_img) + height, width, channels = current_img.shape + if self.resolution is not None: + width = int(np.ceil(float(width) * self.resolution / float(height))) + height = int(self.resolution) + + current_label_name = img_name.split('.')[0] + '.png' + obj_num = self.obj_nums[idx] + + if current_label_name in self.labels: + current_label = self.read_label(current_label_name) + current_label = np.array(current_label) + sample = { + 'current_img': current_img, + 'current_label': current_label + } + else: + sample = { + 'current_img': current_img + } #only the first frame contains label + + sample['meta'] = { + 'seq_name': self.seq_name, + 'frame_num': self.num_frame, + 'obj_num': obj_num, + 'current_name': img_name, + 'height': height, + 'width': width, + 'flip': False + } + if self.pipeline is not None: + sample = self.pipeline(sample) + for s in sample: + s['current_img'] = np.array(s['current_img']) + if 'current_label' in s.keys(): + s['current_label'] = s['current_label'] + return sample + + +@DATASETS.register() +class DavisDataset(BaseDataset): + """Davis 2017 dataset. 
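+
+    Args:
+        file_path (str): DAVIS root directory (see the layout note above).
+        result_root (str): directory the first-frame annotation of each test
+            sequence is copied into before inference.
+        pipeline (obj): data pipeline applied to each frame sample.
+        year (int): DAVIS release year, used to pick ImageSets/<year>. Default: 2017.
+        resolution (str): resolution folder name, e.g. '480p'. Default: '480p'.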
+ """ + def __init__( + self, + file_path, + result_root, + pipeline, + data_prefix=None, + test_mode=False, + year=2017, + rgb=False, + resolution='480p', + ): + self.rgb = rgb + self.result_root = result_root + self.resolution = resolution + self.year = year + self.spt = 'val' if test_mode else 'train' + super().__init__(file_path, pipeline, data_prefix, test_mode) + + def load_file(self): + self.image_root = os.path.join(self.file_path, 'JPEGImages', + self.resolution) + self.label_root = os.path.join(self.file_path, 'Annotations', + self.resolution) + seq_names = [] + with open( + os.path.join(self.file_path, 'ImageSets', str(self.year), + self.spt + '.txt')) as f: + seqs_tmp = f.readlines() + seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp)) + seq_names.extend(seqs_tmp) + self.info = list(np.unique(seq_names)) + return self.info + + def prepare_test(self, idx): + seq_name = self.info[idx] #video name + images = list( + np.sort(os.listdir(os.path.join(self.image_root, seq_name)))) + labels = [images[0].replace('jpg', 'png')] #we have first frame target + + # copy first frame target + if not os.path.isfile( + os.path.join(self.result_root, seq_name, labels[0])): + if not os.path.exists(os.path.join(self.result_root, seq_name)): + os.makedirs(os.path.join(self.result_root, seq_name)) + source_label_path = os.path.join(self.label_root, seq_name, + labels[0]) + result_label_path = os.path.join(self.result_root, seq_name, + labels[0]) + + shutil.copy(source_label_path, result_label_path) + + seq_dataset = VOS_Test(self.image_root, + self.label_root, + seq_name, + images, + labels, + self.pipeline, + rgb=self.rgb, + resolution=480) + return seq_dataset diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/feature.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/feature.py new file mode 100644 index 0000000..df5e33e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/feature.py @@ -0,0 +1,80 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import os.path as osp + +from ..registry import DATASETS +from .base import BaseDataset + + +@DATASETS.register() +class FeatureDataset(BaseDataset): + """Feature dataset for action recognition + Example:(TODO) + Args:(TODO) + """ + def __init__( + self, + file_path, + pipeline, + data_prefix=None, + test_mode=False, + suffix=None, + ): + self.suffix = suffix + super().__init__(file_path, pipeline, data_prefix, test_mode) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + filename = line.strip().split()[0] + if self.data_prefix is not None: + filename = osp.join(self.data_prefix, filename) + if self.suffix is not None: + filename = filename + self.suffix + + info.append(dict(filename=filename)) + return info + + def prepare_train(self, idx): + """TRAIN & VALID. 
Prepare the data for training/valid given the index.""" + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + + if 'iou_norm' in results: + return results['rgb_data'], results['rgb_len'], results[ + 'rgb_mask'], results['audio_data'], results[ + 'audio_len'], results['audio_mask'], results[ + 'labels'], results['iou_norm'] + else: + return results['rgb_data'], results['rgb_len'], results[ + 'rgb_mask'], results['audio_data'], results[ + 'audio_len'], results['audio_mask'], results['labels'] + + def prepare_test(self, idx): + """TEST. Prepare the data for testing given the index.""" + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + + if 'iou_norm' in results: + return results['rgb_data'], results['rgb_len'], results[ + 'rgb_mask'], results['audio_data'], results[ + 'audio_len'], results['audio_mask'], results[ + 'labels'], results['iou_norm'] + else: + return results['rgb_data'], results['rgb_len'], results[ + 'rgb_mask'], results['audio_data'], results[ + 'audio_len'], results['audio_mask'], results['labels'] diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/frame.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/frame.py new file mode 100644 index 0000000..b02f526 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/frame.py @@ -0,0 +1,177 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class FrameDataset(BaseDataset): + """Rawframe dataset for action recognition. + The dataset loads raw frames from frame files, and apply specified transform operatation them. + The indecx file is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace. + Example of an index file: + + .. code-block:: txt + + file_path-1 150 1 + file_path-2 160 1 + file_path-3 170 2 + file_path-4 180 2 + + Args: + file_path (str): Path to the index file. + pipeline(XXX): + data_prefix (str): directory path of the data. Default: None. + test_mode (bool): Whether to bulid the test dataset. Default: False. + suffix (str): suffix of file. Default: 'img_{:05}.jpg'. 
+ + """ + def __init__(self, + file_path, + pipeline, + num_retries=5, + data_prefix=None, + test_mode=False, + suffix='img_{:05}.jpg'): + self.num_retries = num_retries + self.suffix = suffix + super().__init__(file_path, pipeline, data_prefix, test_mode) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split() + frame_dir, frames_len, labels = line_split + if self.data_prefix is not None: + frame_dir = osp.join(self.data_prefix, frame_dir) + info.append( + dict(frame_dir=frame_dir, + suffix=self.suffix, + frames_len=frames_len, + labels=int(labels))) + return info + + def prepare_train(self, idx): + """Prepare the frames for training/valid given index. """ + #Try to catch Exception caused by reading missing frames files + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['frame_dir'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'], np.array([results['labels']]) + + def prepare_test(self, idx): + """Prepare the frames for test given index. """ + #Try to catch Exception caused by reading missing frames files + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['frame_dir'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'], np.array([results['labels']]) + + +@DATASETS.register() +class FrameDataset_Sport(BaseDataset): + """Video dataset for action recognition + The dataset loads raw videos and apply specified transforms on them. + The index file is a file with multiple lines, and each line indicates + a sample video with the filepath and label, which are split with a whitesapce. + Example of a inde file: + .. code-block:: txt + path/000.mp4 1 + path/001.mp4 1 + path/002.mp4 2 + path/003.mp4 2 + Args: + file_path(str): Path to the index file. + pipeline(XXX): A sequence of data transforms. + **kwargs: Keyword arguments for ```BaseDataset```. + """ + def __init__(self, file_path, pipeline, num_retries=5, suffix='', **kwargs): + self.num_retries = num_retries + self.suffix = suffix + super().__init__(file_path, pipeline, **kwargs) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split() + frame_dir = line_split[0] + if self.data_prefix is not None: + frame_dir = osp.join(self.data_prefix, frame_dir) + info.append(dict(frame_dir=frame_dir, suffix=self.suffix)) + return info + + def prepare_train(self, idx): + """TRAIN & VALID. Prepare the data for training/valid given the index.""" + #Try to catch Exception caused by reading corrupted video file + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". 
+ format(results['filename'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'], np.array([results['labels']]) + + def prepare_test(self, idx): + """TEST. Prepare the data for test given the index.""" + #Try to catch Exception caused by reading corrupted video file + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['filename'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'], np.array([results['labels']]) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/ms_tcn_dataset.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/ms_tcn_dataset.py new file mode 100644 index 0000000..56e3b7b --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/ms_tcn_dataset.py @@ -0,0 +1,110 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy + +import os +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class MSTCNDataset(BaseDataset): + """Video dataset for action segmentation. 
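+
+    Args:
+        file_path (str): bundle file listing video names, one per line.
+        pipeline (obj): data pipeline applied to each sample.
+        feature_path (str): directory holding one pre-extracted <video>.npy
+            feature array per video.
+        gt_path (str): directory holding the per-frame ground-truth file of
+            each video, one action name per line.
+        actions_map_file_path (str): mapping file with "<id> <action_name>"
+            lines, used to build the action dictionary.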
+ """ + + def __init__( + self, + file_path, + pipeline, + feature_path, + gt_path, + actions_map_file_path, + **kwargs, + ): + super().__init__(file_path, pipeline, **kwargs) + self.gt_path = gt_path + self.actions_map_file_path = actions_map_file_path + self.feature_path = feature_path + + # actions dict generate + file_ptr = open(self.actions_map_file_path, 'r') + actions = file_ptr.read().split('\n')[:-1] + file_ptr.close() + self.actions_dict = dict() + for a in actions: + self.actions_dict[a.split()[1]] = int(a.split()[0]) + + self.num_classes = len(self.actions_dict.keys()) + + def load_file(self): + """Load index file to get video information.""" + file_ptr = open(self.file_path, 'r') + info = file_ptr.read().split('\n')[:-1] + file_ptr.close() + return info + + def prepare_train(self, idx): + """TRAIN & VALID: Prepare data for training/valid given the index.""" + results = {} + video_name = self.info[idx] + # load video feature + file_name = video_name.split('.')[0] + ".npy" + feat_file_path = os.path.join(self.feature_path, file_name) + #TODO: check path + video_feat = np.load(feat_file_path) + + # load label + target_file_path = os.path.join(self.gt_path, video_name) + file_ptr = open(target_file_path, 'r') + content = file_ptr.read().split('\n')[:-1] + classes = np.zeros(min(np.shape(video_feat)[1], len(content)), dtype='int64') + for i in range(len(classes)): + classes[i] = self.actions_dict[content[i]] + # classes = classes * (-100) + + results['video_feat'] = copy.deepcopy(video_feat) + results['video_gt'] = copy.deepcopy(classes) + + results = self.pipeline(results) + return results['video_feat'], results['video_gt'] + + def prepare_test(self, idx): + """TEST: Prepare the data for test given the index.""" + results = {} + video_name = self.info[idx] + # load video feature + file_name = video_name.split('.')[0] + ".npy" + feat_file_path = os.path.join(self.feature_path, file_name) + #TODO: check path + video_feat = np.load(feat_file_path) + + # load label + target_file_path = os.path.join(self.gt_path, video_name) + file_ptr = open(target_file_path, 'r') + content = file_ptr.read().split('\n')[:-1] + classes = np.zeros(min(np.shape(video_feat)[1], len(content))) + for i in range(len(classes)): + classes[i] = self.actions_dict[content[i]] + # classes = classes * (-100) + + results['video_feat'] = copy.deepcopy(video_feat) + results['video_gt'] = copy.deepcopy(classes) + + results = self.pipeline(results) + return results['video_feat'], results['video_gt'] diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/msrvtt.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/msrvtt.py new file mode 100644 index 0000000..0e5294f --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/msrvtt.py @@ -0,0 +1,220 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os.path as osp +import copy +import random +import numpy as np +try: + import lmdb +except ImportError as e: + print( + f"Warning! {e}, [lmdb] package and it's dependencies is required for ActBERT." + ) +import pickle +try: + from paddlenlp.transformers import BertTokenizer +except ImportError as e: + print( + f"Warning! {e}, [paddlenlp] package and it's dependencies is required for ActBERT." + ) +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class MSRVTTDataset(BaseDataset): + """MSR-VTT dataset for text-video clip retrieval. + """ + def __init__( + self, + file_path, + pipeline, + features_path, + bert_model="bert-base-uncased", + padding_index=0, + max_seq_length=36, + max_region_num=36, + max_action_num=5, + vision_feature_dim=2048, + action_feature_dim=2048, + spatials_dim=5, + data_prefix=None, + test_mode=False, + ): + self.features_path = features_path + self.bert_model = bert_model + self.padding_index = padding_index + self.max_seq_length = max_seq_length + self.max_region_num = max_region_num + self._max_action_num = max_action_num + self.vision_feature_dim = vision_feature_dim + self.action_feature_dim = action_feature_dim + self.spatials_dim = spatials_dim + self._tokenizer = BertTokenizer.from_pretrained(bert_model, + do_lower_case=True) + super().__init__(file_path, pipeline, data_prefix, test_mode) + self.tokenize() + self.gen_feature() + + def load_file(self): + """Load index file to get video information.""" + with open(self.file_path) as fin: + self.image_entries = [] + self.caption_entries = [] + for line in fin.readlines(): + line = line.strip() + vid_id = line.split(',')[0] + self.image_entries.append(vid_id) + self.caption_entries.append({ + "caption": line.split(',')[1], + "vid_id": vid_id + }) + self.env = lmdb.open(self.features_path) + + def tokenize(self): + for entry in self.caption_entries: + tokens = [] + tokens.append("[CLS]") + for token in self._tokenizer.tokenize(entry["caption"]): + tokens.append(token) + tokens.append("[SEP]") + tokens = self._tokenizer.convert_tokens_to_ids(tokens) + + segment_ids = [0] * len(tokens) + input_mask = [1] * len(tokens) + + if len(tokens) < self.max_seq_length: + padding = [self.padding_index + ] * (self.max_seq_length - len(tokens)) + tokens = tokens + padding + input_mask += padding + segment_ids += padding + + entry["token"] = np.array(tokens).astype('int64') + entry["input_mask"] = np.array(input_mask) + entry["segment_ids"] = np.array(segment_ids).astype('int64') + + def get_image_feature(self, video_id): + video_id = str(video_id).encode() + with self.env.begin(write=False) as txn: + item = pickle.loads(txn.get(video_id)) + video_id = item["video_id"] + image_h = int(item["image_h"]) + image_w = int(item["image_w"]) + + features = item["features"].reshape(-1, self.vision_feature_dim) + boxes = item["boxes"].reshape(-1, 4) + + num_boxes = features.shape[0] + g_feat = np.sum(features, axis=0) / num_boxes + num_boxes = num_boxes + 1 + features = np.concatenate( + [np.expand_dims(g_feat, axis=0), features], axis=0) + + action_features = item["action_features"].reshape( + -1, self.action_feature_dim) + + image_location = np.zeros((boxes.shape[0], self.spatials_dim), + dtype=np.float32) + image_location[:, :4] = boxes + image_location[:, + 4] = ((image_location[:, 3] - image_location[:, 1]) * + (image_location[:, 2] - image_location[:, 0]) / + (float(image_w) * float(image_h))) + + image_location[:, 
0] = image_location[:, 0] / float(image_w) + image_location[:, 1] = image_location[:, 1] / float(image_h) + image_location[:, 2] = image_location[:, 2] / float(image_w) + image_location[:, 3] = image_location[:, 3] / float(image_h) + + g_location = np.array([0, 0, 1, 1, 1]) + image_location = np.concatenate( + [np.expand_dims(g_location, axis=0), image_location], axis=0) + return features, num_boxes, image_location, action_features + + def gen_feature(self): + num_inst = len(self.image_entries) #1000 + self.features_all = np.zeros( + (num_inst, self.max_region_num, self.vision_feature_dim)) + self.action_features_all = np.zeros( + (num_inst, self._max_action_num, self.action_feature_dim)) + self.spatials_all = np.zeros( + (num_inst, self.max_region_num, self.spatials_dim)) + self.image_mask_all = np.zeros((num_inst, self.max_region_num)) + self.action_mask_all = np.zeros((num_inst, self._max_action_num)) + + for i, image_id in enumerate(self.image_entries): + features, num_boxes, boxes, action_features = self.get_image_feature( + image_id) + + mix_num_boxes = min(int(num_boxes), self.max_region_num) + mix_boxes_pad = np.zeros((self.max_region_num, self.spatials_dim)) + mix_features_pad = np.zeros( + (self.max_region_num, self.vision_feature_dim)) + + image_mask = [1] * (int(mix_num_boxes)) + while len(image_mask) < self.max_region_num: + image_mask.append(0) + action_mask = [1] * (self._max_action_num) + while len(action_mask) < self._max_action_num: + action_mask.append(0) + + mix_boxes_pad[:mix_num_boxes] = boxes[:mix_num_boxes] + mix_features_pad[:mix_num_boxes] = features[:mix_num_boxes] + + self.features_all[i] = mix_features_pad + x = action_features.shape[0] + self.action_features_all[i][:x] = action_features[:] + self.image_mask_all[i] = np.array(image_mask) + self.action_mask_all[i] = np.array(action_mask) + self.spatials_all[i] = mix_boxes_pad + + self.features_all = self.features_all.astype("float32") + self.action_features_all = self.action_features_all.astype("float32") + self.image_mask_all = self.image_mask_all.astype("int64") + self.action_mask_all = self.action_mask_all.astype("int64") + self.spatials_all = self.spatials_all.astype("float32") + + def prepare_train(self, idx): + pass + + def prepare_test(self, idx): + entry = self.caption_entries[idx] + caption = entry["token"] + input_mask = entry["input_mask"] + segment_ids = entry["segment_ids"] + + target_all = np.zeros(1000) + for i, image_id in enumerate(self.image_entries): + if image_id == entry["vid_id"]: + target_all[i] = 1 + + return ( + caption, + self.action_features_all, + self.features_all, + self.spatials_all, + segment_ids, + input_mask, + self.image_mask_all, + self.action_mask_all, + target_all, + ) + + def __len__(self): + return len(self.caption_entries) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/oxford.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/oxford.py new file mode 100644 index 0000000..a9e65c6 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/oxford.py @@ -0,0 +1,62 @@ +# Copyright Niantic 2019. Patent Pending. All rights reserved. +# +# This software is licensed under the terms of the Monodepth2 licence +# which allows for non-commercial use only, the full terms of which are made +# available in the LICENSE file. 
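+
+# Index file sketch (an assumption based on how MonoDataset.load_file parses
+# each line as "<folder>/<frame_index>"; the names below are placeholders):
+#   some_sequence/000001
+#   some_sequence/000002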
+ +from __future__ import absolute_import, division, print_function + +import copy +from os import path as osp + +from PIL import Image + +from ..registry import DATASETS +from .base import BaseDataset + + +def pil_loader(path): + # open path as file to avoid ResourceWarning + # (https://github.com/python-pillow/Pillow/issues/835) + with open(path, 'rb') as f: + with Image.open(f) as img: + return img.convert('RGB') + + +@DATASETS.register() +class MonoDataset(BaseDataset): + def __init__(self, + file_path, + data_prefix, + pipeline, + num_retries=0, + suffix='.png', + **kwargs): + self.num_retries = num_retries + self.suffix = suffix + super().__init__(file_path, pipeline, data_prefix, **kwargs) + + def load_file(self): + info = [] + with open(self.file_path, 'r') as f: + for line in f: + filename = line.strip() + self.suffix + folder = osp.dirname(filename) + frame_index = line.strip().split('/')[1] + info.append( + dict(data_path=self.data_prefix, + filename=filename, + folder=folder, + frame_index=int(frame_index))) + return info + + def prepare_train(self, idx): + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + results['imgs']['idx'] = idx + return results['imgs'], results['day_or_night'] + + def prepare_test(self, idx): + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + return results['imgs'], results['day_or_night'] diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/skeleton.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/skeleton.py new file mode 100644 index 0000000..30a3f3e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/skeleton.py @@ -0,0 +1,78 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np +import pickle + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class SkeletonDataset(BaseDataset): + """ + Skeleton dataset for action recognition. + The dataset loads skeleton feature, and apply norm operatations. + Args: + file_path (str): Path to the index file. + pipeline(obj): Define the pipeline of data preprocessing. + data_prefix (str): directory path of the data. Default: None. + test_mode (bool): Whether to bulid the test dataset. Default: False. 
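+        label_path (str): path to the label file (.npy or .pkl); may be omitted
+            at test time, in which case only predictions are returned. Default: None.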
+ """ + def __init__(self, file_path, pipeline, label_path=None, test_mode=False): + self.label_path = label_path + super().__init__(file_path, pipeline, test_mode=test_mode) + + def load_file(self): + """Load feature file to get skeleton information.""" + logger.info("Loading data, it will take some moment...") + self.data = np.load(self.file_path) + if self.label_path: + if self.label_path.endswith('npy'): + self.label = np.load(self.label_path) + elif self.label_path.endswith('pkl'): + with open(self.label_path, 'rb') as f: + sample_name, self.label = pickle.load(f) + else: + logger.info( + "Label path not provided when test_mode={}, here just output predictions." + .format(self.test_mode)) + logger.info("Data Loaded!") + return self.data # used for __len__ + + def prepare_train(self, idx): + """Prepare the feature for training/valid given index. """ + results = dict() + results['data'] = copy.deepcopy(self.data[idx]) + results['label'] = copy.deepcopy(self.label[idx]) + results = self.pipeline(results) + return results['data'], results['label'] + + def prepare_test(self, idx): + """Prepare the feature for test given index. """ + results = dict() + results['data'] = copy.deepcopy(self.data[idx]) + if self.label_path: + results['label'] = copy.deepcopy(self.label[idx]) + results = self.pipeline(results) + return results['data'], results['label'] + else: + results = self.pipeline(results) + return [results['data']] diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/slowfast_video.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/slowfast_video.py new file mode 100644 index 0000000..1adf89c --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/slowfast_video.py @@ -0,0 +1,143 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + +@DATASETS.register() +class SFVideoDataset(BaseDataset): + """Video dataset for action recognition + The dataset loads raw videos and apply specified transforms on them. + + The index file is a file with multiple lines, and each line indicates + a sample video with the filepath and label, which are split with a whitesapce. + Example of a inde file: + + .. code-block:: txt + + path/000.mp4 1 + path/001.mp4 1 + path/002.mp4 2 + path/003.mp4 2 + + Args: + file_path(str): Path to the index file. + pipeline(XXX): A sequence of data transforms. + num_ensemble_views(int): temporal segment when multi-crop test + num_spatial_crops(int): spatial crop number when multi-crop test + **kwargs: Keyword arguments for ```BaseDataset```. 
+ + """ + def __init__( + self, + file_path, + pipeline, + num_ensemble_views=1, + num_spatial_crops=1, + num_retries=5, + num_samples_precise_bn=None, + **kwargs, + ): + self.num_ensemble_views = num_ensemble_views + self.num_spatial_crops = num_spatial_crops + self.num_retries = num_retries + self.num_samples_precise_bn = num_samples_precise_bn + super().__init__(file_path, pipeline, **kwargs) + #set random seed + random.seed(0) + np.random.seed(0) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split() + filename, labels = line_split + if self.data_prefix is not None: + filename = osp.join(self.data_prefix, filename) + for tidx in range(self.num_ensemble_views): + for sidx in range(self.num_spatial_crops): + info.append( + dict( + filename=filename, + labels=int(labels), + temporal_sample_index=tidx, + spatial_sample_index=sidx, + temporal_num_clips=self.num_ensemble_views, + spatial_num_clips=self.num_spatial_crops, + )) + return info + + def prepare_train(self, idx): + """TRAIN & VALID. Prepare the data for training given the index.""" + #Try to catch Exception caused by reading corrupted video file + short_cycle = False + if isinstance(idx, tuple): + idx, short_cycle_idx = idx + short_cycle = True + for ir in range(self.num_retries): + try: + #Multi-grid short cycle + if short_cycle: + results = copy.deepcopy(self.info[idx]) + results['short_cycle_idx'] = short_cycle_idx + else: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['filename'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + + return results['imgs'][0], results['imgs'][1], np.array( + [results['labels']]) + + def prepare_test(self, idx): + """TEST. Prepare the data for test given the index.""" + #Try to catch Exception caused by reading corrupted video file + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['filename'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'][0], results['imgs'][1], np.array( + [results['labels']]), np.array([idx]) + + def __len__(self): + """get the size of the dataset.""" + if self.num_samples_precise_bn is None: + return len(self.info) + else: + random.shuffle(self.info) + return min(self.num_samples_precise_bn, len(self.info)) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/ucf101_skeleton.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/ucf101_skeleton.py new file mode 100644 index 0000000..8177933 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/ucf101_skeleton.py @@ -0,0 +1,89 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np +import pickle + +import paddle +from paddle.io import Dataset + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class UCF101SkeletonDataset(BaseDataset): + """ + Skeleton dataset for action recognition. + The dataset loads skeleton feature, and apply norm operatations. + Args: + file_path (str): Path to the index file. + pipeline(obj): Define the pipeline of data preprocessing. + test_mode (bool): Whether to bulid the test dataset. Default: False. + """ + + def __init__(self, + file_path, + pipeline, + split, + repeat_times, + test_mode=False): + self.split = split + self.repeat_times = repeat_times + super().__init__(file_path, pipeline, test_mode=test_mode) + self._ori_len = len(self.info) + self.start_index = 0 + self.modality = "Pose" + + def load_file(self): + """Load annotation file to get video information.""" + assert self.file_path.endswith('.pkl') + return self.load_pkl_annotations() + + def load_pkl_annotations(self): + with open(self.file_path, "rb") as f: + data = pickle.load(f) + + if self.split: + split, data = data['split'], data['annotations'] + identifier = 'filename' if 'filename' in data[0] else 'frame_dir' + data = [x for x in data if x[identifier] in split[self.split]] + + return data + + def prepare_train(self, idx): + """Prepare the frames for training given the index.""" + results = copy.deepcopy(self.info[idx % self._ori_len]) + results['modality'] = self.modality + results['start_index'] = self.start_index + + return self.pipeline(results) + + def prepare_test(self, idx): + """Prepare the frames for testing given the index.""" + results = copy.deepcopy(self.info[idx % self._ori_len]) + results['modality'] = self.modality + results['start_index'] = self.start_index + + return self.pipeline(results) + + def __len__(self): + """get the size of the dataset.""" + return len(self.info) * self.repeat_times diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/ucf24_dataset.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/ucf24_dataset.py new file mode 100644 index 0000000..ad2e84e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/ucf24_dataset.py @@ -0,0 +1,76 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
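+
+# Index file sketch for UCF24Dataset below (the path pattern comes from the
+# inline comment in load_file; the concrete class/video names are placeholders):
+#   data/ucf24/labels/Basketball/v_Basketball_g01_c01/00009.txt
+# Each label path is mapped to its key frame by replacing 'labels' with
+# 'rgb-images' and '.txt' with '.jpg'.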
+ +import copy +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class UCF24Dataset(BaseDataset): + """Dataset for YOWO + The dataset loads raw videos and apply specified transforms on them. + The index file is a file with multiple lines, and each line indicates + a sample video with the filepath and label, which are split with a whitesapce. + Example of a inde file: + .. code-block:: txt + + Args: + file_path(str): Path to the index file. + pipeline(XXX): A sequence of data transforms. + **kwargs: Keyword arguments for ```BaseDataset```. + """ + + def __init__(self, file_path, pipeline, num_retries=5, **kwargs): + self.num_retries = num_retries + super().__init__(file_path, pipeline, **kwargs) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + lines = fin.readlines() + for line in lines: + line = line.strip() # 'data/ucf24/labels/class_name/video_name/key_frame.txt' + filename = line.replace('txt', 'jpg').replace( + 'labels', 'rgb-images') # key frame path + + info.append(dict(filename=filename)) + return info + + def prepare_train(self, idx): + """TRAIN & VALID. Prepare the data for training/valid given the index.""" + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + im_path = results['filename'] + im_path = im_path.replace('jpg', 'txt') + im_split = im_path.split('/') + frame_index = im_split[3] + '_' + im_split[4] + '_' + im_split[5] + return results['imgs'], np.array([results['labels']]), frame_index + + def prepare_test(self, idx): + """TEST. Prepare the data for test given the index.""" + # Try to catch Exception caused by reading corrupted video file + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + im_path = results['filename'] + im_path = im_path.replace('jpg', 'txt') + im_split = im_path.split('/') + frame_index = im_split[3] + '_' + im_split[4] + '_' + im_split[5] + return results['imgs'], np.array([results['labels']]), frame_index diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/video.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/video.py new file mode 100644 index 0000000..f2d8f89 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/video.py @@ -0,0 +1,95 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class VideoDataset(BaseDataset): + """Video dataset for action recognition + The dataset loads raw videos and apply specified transforms on them. 
+ The index file is a file with multiple lines, and each line indicates + a sample video with the filepath and label, which are split with a whitesapce. + Example of a inde file: + .. code-block:: txt + path/000.mp4 1 + path/001.mp4 1 + path/002.mp4 2 + path/003.mp4 2 + Args: + file_path(str): Path to the index file. + pipeline(XXX): A sequence of data transforms. + **kwargs: Keyword arguments for ```BaseDataset```. + """ + def __init__(self, file_path, pipeline, num_retries=5, suffix='', **kwargs): + self.num_retries = num_retries + self.suffix = suffix + super().__init__(file_path, pipeline, **kwargs) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split() + filename, labels = line_split + #TODO(hj): Required suffix format: may mp4/avi/wmv + filename = filename + self.suffix + if self.data_prefix is not None: + filename = osp.join(self.data_prefix, filename) + info.append(dict(filename=filename, labels=int(labels))) + return info + + def prepare_train(self, idx): + """TRAIN & VALID. Prepare the data for training/valid given the index.""" + #Try to catch Exception caused by reading corrupted video file + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['filename'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'], np.array([results['labels']]) + + def prepare_test(self, idx): + """TEST. Prepare the data for test given the index.""" + #Try to catch Exception caused by reading corrupted video file + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['filename'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'], np.array([results['labels']]) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__init__.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__init__.py new file mode 100644 index 0000000..6e6afdc --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__init__.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
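+
+# The ops imported below register themselves into the PIPELINES registry and
+# act as callables mapping a `results` dict to a `results` dict, so they can be
+# chained (usually via Compose). A commented-out, illustrative sketch only;
+# the argument values are placeholders, not the project's real configs:
+# results = {'filename': 'path/000.mp4'}
+# for op in [VideoDecoder(), Sampler(8, 1, valid_mode=True), Scale(256)]:
+#     results = op(results)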
+ +from .anet_pipeline import GetMatchMap, GetVideoLabel, LoadFeat +from .augmentations import (CenterCrop, ColorJitter, GroupRandomFlip, + GroupResize, Image2Array, JitterScale, MultiCrop, + Normalization, PackOutput, RandomCrop, RandomFlip, + RandomResizedCrop, Scale, TenCrop, ToArray, + UniformCrop, RandomGamma, MultiCenterCrop, + RandomBrightness, RandomHue, RandomSaturation, YowoAug) +from .augmentations_ava import * +from .compose import Compose +from .decode import FeatureDecoder, FrameDecoder, VideoDecoder, ActionFeatureDecoder +from .decode_image import ImageDecoder +from .decode_sampler import DecodeSampler +from .mix import Cutmix, Mixup, VideoMix +from .multimodal import FeaturePadding, RandomCap, RandomMask, Tokenize +from .sample import Sampler, SamplerPkl +from .sample_ava import * +from .segmentation import MultiNorm, MultiRestrictSize +from .skeleton_pipeline import AutoPadding, Iden, SkeletonNorm +from .skeleton_pipeline import SketeonCropSample, SketeonModalityTransform, RandomRotation +from .skeleton_pipeline import (UniformSampleFrames, PoseDecode, PoseCompact, + RandomResizedCrop_V2, Flip_V2, CenterCrop_V2, + GeneratePoseTarget, FormatShape, Collect) +from .decode_sampler_MRI import SFMRI_DecodeSampler +from .segmentation_pipline import SegmentationSampler +from .sample_ucf24 import SamplerUCF24 + +__all__ = [ + 'ImageDecoder', 'RandomMask', 'UniformCrop', 'SkeletonNorm', 'Tokenize', + 'Sampler', 'FeatureDecoder', 'DecodeSampler', 'TenCrop', 'Compose', + 'AutoPadding', 'Normalization', 'Mixup', 'Image2Array', 'Scale', + 'GroupResize', 'VideoDecoder', 'FrameDecoder', 'PackOutput', + 'ActionFeatureDecoder', 'GetVideoLabel', 'Cutmix', 'CenterCrop', + 'RandomCrop', 'LoadFeat', 'RandomCap', 'JitterScale', 'Iden', 'VideoMix', + 'ColorJitter', 'RandomFlip', 'ToArray', 'FeaturePadding', 'GetMatchMap', + 'GroupRandomFlip', 'MultiCrop', 'SFMRI_DecodeSampler', 'MultiRestrictSize', + 'MultiNorm', 'RandomResizedCrop', 'SamplerPkl', 'SegmentationSampler', + 'SketeonCropSample', 'SketeonModalityTransform', 'RandomRotation', + 'RandomGamma', 'MultiCenterCrop', 'RandomBrightness', 'RandomHue', + 'RandomSaturation', 'UniformSampleFrames', 'PoseDecode', 'PoseCompact', + 'Resize', 'RandomResizedCrop_V2', 'Flip_V2', 'GeneratePoseTarget', + 'FormatShape', 'Collect', 'RandomSaturation', 'SamplerUCF24', 'YowoAug' +] diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..303f568 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/anet_pipeline.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/anet_pipeline.cpython-310.pyc new file mode 100644 index 0000000..5e4b7eb Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/anet_pipeline.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/augmentations.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/augmentations.cpython-310.pyc new file mode 100644 index 0000000..d785703 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/augmentations.cpython-310.pyc differ 
diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/augmentations_ava.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/augmentations_ava.cpython-310.pyc new file mode 100644 index 0000000..ce471b7 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/augmentations_ava.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/compose.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/compose.cpython-310.pyc new file mode 100644 index 0000000..8a2e06c Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/compose.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode.cpython-310.pyc new file mode 100644 index 0000000..72e5884 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode_image.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode_image.cpython-310.pyc new file mode 100644 index 0000000..ddf1ab1 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode_image.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode_sampler.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode_sampler.cpython-310.pyc new file mode 100644 index 0000000..99b86d9 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode_sampler.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode_sampler_MRI.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode_sampler_MRI.cpython-310.pyc new file mode 100644 index 0000000..17917fa Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode_sampler_MRI.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/mix.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/mix.cpython-310.pyc new file mode 100644 index 0000000..0d47a42 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/mix.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/multimodal.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/multimodal.cpython-310.pyc new file mode 100644 index 0000000..fcf9da5 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/multimodal.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/sample.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/sample.cpython-310.pyc new file mode 100644 index 0000000..b90beeb Binary files /dev/null and 
b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/sample.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/sample_ava.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/sample_ava.cpython-310.pyc new file mode 100644 index 0000000..0187f00 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/sample_ava.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/sample_ucf24.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/sample_ucf24.cpython-310.pyc new file mode 100644 index 0000000..0cf5b4f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/sample_ucf24.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/segmentation.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/segmentation.cpython-310.pyc new file mode 100644 index 0000000..7dd91b3 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/segmentation.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/segmentation_pipline.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/segmentation_pipline.cpython-310.pyc new file mode 100644 index 0000000..00d87e4 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/segmentation_pipline.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/skeleton_pipeline.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/skeleton_pipeline.cpython-310.pyc new file mode 100644 index 0000000..e4ddc34 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/skeleton_pipeline.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/anet_pipeline.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/anet_pipeline.py new file mode 100644 index 0000000..210d733 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/anet_pipeline.py @@ -0,0 +1,150 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +from ..registry import PIPELINES +"""pipeline ops for Activity Net. 
+""" + + +@PIPELINES.register() +class LoadFeat(object): + def __init__(self, feat_path): + self.feat_path = feat_path + + def __call__(self, results): + video_name = results['video_name'] + file_name = video_name + ".npy" + file_path = os.path.join(self.feat_path, file_name) + #TODO: check path + video_feat = np.load(file_path) + video_feat = video_feat.T + video_feat = video_feat.astype("float32") + results['video_feat'] = video_feat + return results + + +@PIPELINES.register() +class GetMatchMap(object): + def __init__(self, tscale): + self.tscale = tscale + self.tgap = 1. / self.tscale + + def __call__(self, results): + match_map = [] + for idx in range(self.tscale): + tmp_match_window = [] + xmin = self.tgap * idx + for jdx in range(1, self.tscale + 1): + xmax = xmin + self.tgap * jdx + tmp_match_window.append([xmin, xmax]) + match_map.append(tmp_match_window) + match_map = np.array(match_map) + match_map = np.transpose(match_map, [1, 0, 2]) + match_map = np.reshape(match_map, [-1, 2]) + + anchor_xmin = [self.tgap * i for i in range(self.tscale)] + anchor_xmax = [self.tgap * i for i in range(1, self.tscale + 1)] + + results['match_map'] = match_map + results['anchor_xmin'] = anchor_xmin + results['anchor_xmax'] = anchor_xmax + return results + + +@PIPELINES.register() +class GetVideoLabel(object): + def __init__(self, tscale, dscale, datatype="float32"): + self.tscale = tscale + self.dscale = dscale + self.tgap = 1. / self.tscale + self.datatype = datatype + + def iou_with_anchors(self, anchors_min, anchors_max, box_min, box_max): + """Compute jaccard score between a box and the anchors. + """ + len_anchors = anchors_max - anchors_min + int_xmin = np.maximum(anchors_min, box_min) + int_xmax = np.minimum(anchors_max, box_max) + inter_len = np.maximum(int_xmax - int_xmin, 0.) + union_len = len_anchors - inter_len + box_max - box_min + jaccard = np.divide(inter_len, union_len) + return jaccard + + def ioa_with_anchors(self, anchors_min, anchors_max, box_min, box_max): + """Compute intersection between score a box and the anchors. + """ + len_anchors = anchors_max - anchors_min + int_xmin = np.maximum(anchors_min, box_min) + int_xmax = np.minimum(anchors_max, box_max) + inter_len = np.maximum(int_xmax - int_xmin, 0.) 
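+ # Unlike the IoU above, IoA normalizes the intersection by the anchor length alone, i.e. the fraction of each anchor covered by the box.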
+ scores = np.divide(inter_len, len_anchors) + return scores + + def __call__(self, results): + video_info = results['video_info'] + match_map = results['match_map'] + anchor_xmin = results['anchor_xmin'] + anchor_xmax = results['anchor_xmax'] + + video_second = video_info['duration_second'] + video_labels = video_info['annotations'] + + gt_bbox = [] + gt_iou_map = [] + for gt in video_labels: + tmp_start = max(min(1, gt["segment"][0] / video_second), 0) + tmp_end = max(min(1, gt["segment"][1] / video_second), 0) + gt_bbox.append([tmp_start, tmp_end]) + tmp_gt_iou_map = self.iou_with_anchors(match_map[:, 0], + match_map[:, 1], tmp_start, + tmp_end) + tmp_gt_iou_map = np.reshape(tmp_gt_iou_map, + [self.dscale, self.tscale]) + gt_iou_map.append(tmp_gt_iou_map) + gt_iou_map = np.array(gt_iou_map) + gt_iou_map = np.max(gt_iou_map, axis=0) + + gt_bbox = np.array(gt_bbox) + gt_xmins = gt_bbox[:, 0] + gt_xmaxs = gt_bbox[:, 1] + gt_len_small = 3 * self.tgap + gt_start_bboxs = np.stack( + (gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2), axis=1) + gt_end_bboxs = np.stack( + (gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2), axis=1) + + match_score_start = [] + for jdx in range(len(anchor_xmin)): + match_score_start.append( + np.max( + self.ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx], + gt_start_bboxs[:, 0], + gt_start_bboxs[:, 1]))) + match_score_end = [] + for jdx in range(len(anchor_xmin)): + match_score_end.append( + np.max( + self.ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx], + gt_end_bboxs[:, 0], gt_end_bboxs[:, + 1]))) + + gt_start = np.array(match_score_start) + gt_end = np.array(match_score_end) + + results['gt_iou_map'] = gt_iou_map.astype(self.datatype) + results['gt_start'] = gt_start.astype(self.datatype) + results['gt_end'] = gt_end.astype(self.datatype) + return results diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/augmentations.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/augmentations.py new file mode 100644 index 0000000..24f3c71 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/augmentations.py @@ -0,0 +1,1427 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import random +from collections.abc import Sequence + +import cv2 +import numpy as np +import paddle +import paddle.nn.functional as F +from PIL import Image + +from ..registry import PIPELINES + + +@PIPELINES.register() +class Scale(object): + """ + Scale images. + Args: + short_size(float | int): Short size of an image will be scaled to the short_size. + fixed_ratio(bool): Set whether to zoom according to a fixed ratio. default: True + do_round(bool): Whether to round up when calculating the zoom ratio. default: False + backend(str): Choose pillow or cv2 as the graphics processing backend. 
default: 'pillow' + """ + def __init__(self, + short_size, + fixed_ratio=True, + keep_ratio=None, + do_round=False, + backend='pillow'): + self.short_size = short_size + assert (fixed_ratio and not keep_ratio) or (not fixed_ratio), \ + f"fixed_ratio and keep_ratio cannot be true at the same time" + self.fixed_ratio = fixed_ratio + self.keep_ratio = keep_ratio + self.do_round = do_round + + assert backend in [ + 'pillow', 'cv2' + ], f"Scale's backend must be pillow or cv2, but get {backend}" + self.backend = backend + + def __call__(self, results): + """ + Performs resize operations. + Args: + imgs (Sequence[PIL.Image]): List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + resized_imgs: List where each item is a PIL.Image after scaling. + """ + imgs = results['imgs'] + resized_imgs = [] + for i in range(len(imgs)): + img = imgs[i] + if isinstance(img, np.ndarray): + h, w, _ = img.shape + elif isinstance(img, Image.Image): + w, h = img.size + else: + raise NotImplementedError + if (w <= h and w == self.short_size) or (h <= w + and h == self.short_size): + if self.backend == 'pillow' and not isinstance( + img, Image.Image): + img = Image.fromarray(img) + resized_imgs.append(img) + continue + + if w <= h: + ow = self.short_size + if self.fixed_ratio: + oh = int(self.short_size * 4.0 / 3.0) + elif self.keep_ratio is False: + oh = self.short_size + else: + scale_factor = self.short_size / w + oh = int(h * float(scale_factor) + + 0.5) if self.do_round else int(h * + self.short_size / w) + ow = int(w * float(scale_factor) + + 0.5) if self.do_round else self.short_size + else: + oh = self.short_size + if self.fixed_ratio: + ow = int(self.short_size * 4.0 / 3.0) + elif self.keep_ratio is False: + ow = self.short_size + else: + scale_factor = self.short_size / h + oh = int(h * float(scale_factor) + + 0.5) if self.do_round else self.short_size + ow = int(w * float(scale_factor) + + 0.5) if self.do_round else int(w * + self.short_size / h) + if self.backend == 'pillow': + resized_imgs.append(img.resize((ow, oh), Image.BILINEAR)) + elif self.backend == 'cv2' and (self.keep_ratio is not None): + resized_imgs.append( + cv2.resize(img, (ow, oh), interpolation=cv2.INTER_LINEAR)) + else: + resized_imgs.append( + Image.fromarray( + cv2.resize(np.asarray(img), (ow, oh), + interpolation=cv2.INTER_LINEAR))) + results['imgs'] = resized_imgs + return results + + +@PIPELINES.register() +class RandomCrop(object): + """ + Random crop images. + Args: + target_size(int): Random crop a square with the target_size from an image. + """ + def __init__(self, target_size): + self.target_size = target_size + + def __call__(self, results): + """ + Performs random crop operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + crop_imgs: List where each item is a PIL.Image after random crop. 
+ + """ + imgs = results['imgs'] + if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w] + h, w = imgs.shape[2:] + else: + w, h = imgs[0].size + th, tw = self.target_size, self.target_size + + assert (w >= self.target_size) and (h >= self.target_size), \ + "image width({}) and height({}) should be larger than crop size".format( + w, h, self.target_size) + + crop_images = [] + if 'backend' in results and results['backend'] == 'pyav': + x1 = np.random.randint(0, w - tw) + y1 = np.random.randint(0, h - th) + crop_images = imgs[:, :, y1:y1 + th, x1:x1 + tw] # [C, T, th, tw] + else: + x1 = random.randint(0, w - tw) + y1 = random.randint(0, h - th) + for img in imgs: + if w == tw and h == th: + crop_images.append(img) + else: + crop_images.append(img.crop((x1, y1, x1 + tw, y1 + th))) + results['imgs'] = crop_images + return results + + +@PIPELINES.register() +class RandomResizedCrop(RandomCrop): + def __init__(self, + area_range=(0.08, 1.0), + aspect_ratio_range=(3 / 4, 4 / 3), + target_size=224, + backend='cv2'): + + self.area_range = area_range + self.aspect_ratio_range = aspect_ratio_range + self.target_size = target_size + self.backend = backend + + @staticmethod + def get_crop_bbox(img_shape, + area_range, + aspect_ratio_range, + max_attempts=10): + + assert 0 < area_range[0] <= area_range[1] <= 1 + assert 0 < aspect_ratio_range[0] <= aspect_ratio_range[1] + + img_h, img_w = img_shape + area = img_h * img_w + + min_ar, max_ar = aspect_ratio_range + aspect_ratios = np.exp( + np.random.uniform(np.log(min_ar), np.log(max_ar), + size=max_attempts)) + target_areas = np.random.uniform(*area_range, size=max_attempts) * area + candidate_crop_w = np.round(np.sqrt(target_areas * + aspect_ratios)).astype(np.int32) + candidate_crop_h = np.round(np.sqrt(target_areas / + aspect_ratios)).astype(np.int32) + + for i in range(max_attempts): + crop_w = candidate_crop_w[i] + crop_h = candidate_crop_h[i] + if crop_h <= img_h and crop_w <= img_w: + x_offset = random.randint(0, img_w - crop_w) + y_offset = random.randint(0, img_h - crop_h) + return x_offset, y_offset, x_offset + crop_w, y_offset + crop_h + + # Fallback + crop_size = min(img_h, img_w) + x_offset = (img_w - crop_size) // 2 + y_offset = (img_h - crop_size) // 2 + return x_offset, y_offset, x_offset + crop_size, y_offset + crop_size + + def __call__(self, results): + imgs = results['imgs'] + if self.backend == 'pillow': + img_w, img_h = imgs[0].size + elif self.backend == 'cv2': + img_h, img_w, _ = imgs[0].shape + elif self.backend == 'pyav': + img_h, img_w = imgs.shape[2:] # [cthw] + else: + raise NotImplementedError + + left, top, right, bottom = self.get_crop_bbox( + (img_h, img_w), self.area_range, self.aspect_ratio_range) + + if self.backend == 'pillow': + img_w, img_h = imgs[0].size + imgs = [img.crop((left, top, right, bottom)) for img in imgs] + elif self.backend == 'cv2': + img_h, img_w, _ = imgs[0].shape + imgs = [img[top:bottom, left:right] for img in imgs] + elif self.backend == 'pyav': + img_h, img_w = imgs.shape[2:] # [cthw] + imgs = imgs[:, :, top:bottom, left:right] + else: + raise NotImplementedError + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class CenterCrop(object): + """ + Center crop images. + Args: + target_size(int): Center crop a square with the target_size from an image. + do_round(bool): Whether to round up the coordinates of the upper left corner of the cropping area.
default: True + """ + def __init__(self, target_size, do_round=True, backend='pillow'): + self.target_size = target_size + self.do_round = do_round + self.backend = backend + + def __call__(self, results): + """ + Performs Center crop operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + ccrop_imgs: List where each item is a PIL.Image after Center crop. + """ + imgs = results['imgs'] + ccrop_imgs = [] + th, tw = self.target_size, self.target_size + if isinstance(imgs, paddle.Tensor): + h, w = imgs.shape[-2:] + x1 = int(round((w - tw) / 2.0)) if self.do_round else (w - tw) // 2 + y1 = int(round((h - th) / 2.0)) if self.do_round else (h - th) // 2 + ccrop_imgs = imgs[:, :, y1:y1 + th, x1:x1 + tw] + else: + for img in imgs: + if self.backend == 'pillow': + w, h = img.size + elif self.backend == 'cv2': + h, w, _ = img.shape + else: + raise NotImplementedError + assert (w >= self.target_size) and (h >= self.target_size), \ + "image width({}) and height({}) should be larger than crop size".format( + w, h, self.target_size) + x1 = int(round( + (w - tw) / 2.0)) if self.do_round else (w - tw) // 2 + y1 = int(round( + (h - th) / 2.0)) if self.do_round else (h - th) // 2 + if self.backend == 'cv2': + ccrop_imgs.append(img[y1:y1 + th, x1:x1 + tw]) + elif self.backend == 'pillow': + ccrop_imgs.append(img.crop((x1, y1, x1 + tw, y1 + th))) + results['imgs'] = ccrop_imgs + return results + + +@PIPELINES.register() +class MultiScaleCrop(object): + """ + Random crop images in with multiscale sizes + Args: + target_size(int): Random crop a square with the target_size from an image. + scales(int): List of candidate cropping scales. + max_distort(int): Maximum allowable deformation combination distance. + fix_crop(int): Whether to fix the cutting start point. + allow_duplication(int): Whether to allow duplicate candidate crop starting points. + more_fix_crop(int): Whether to allow more cutting starting points. + """ + def __init__( + self, + target_size, # NOTE: named target size now, but still pass short size in it! + scales=None, + max_distort=1, + fix_crop=True, + allow_duplication=False, + more_fix_crop=True, + backend='pillow'): + + self.target_size = target_size + self.scales = scales if scales else [1, .875, .75, .66] + self.max_distort = max_distort + self.fix_crop = fix_crop + self.allow_duplication = allow_duplication + self.more_fix_crop = more_fix_crop + assert backend in [ + 'pillow', 'cv2' + ], f"MultiScaleCrop's backend must be pillow or cv2, but get {backend}" + self.backend = backend + + def __call__(self, results): + """ + Performs MultiScaleCrop operations. + Args: + imgs: List where wach item is a PIL.Image. 
+ XXX: + results: + + """ + imgs = results['imgs'] + + input_size = [self.target_size, self.target_size] + + im_size = imgs[0].size + + # get random crop offset + def _sample_crop_size(im_size): + image_w, image_h = im_size[0], im_size[1] + + base_size = min(image_w, image_h) + crop_sizes = [int(base_size * x) for x in self.scales] + crop_h = [ + input_size[1] if abs(x - input_size[1]) < 3 else x + for x in crop_sizes + ] + crop_w = [ + input_size[0] if abs(x - input_size[0]) < 3 else x + for x in crop_sizes + ] + + pairs = [] + for i, h in enumerate(crop_h): + for j, w in enumerate(crop_w): + if abs(i - j) <= self.max_distort: + pairs.append((w, h)) + crop_pair = random.choice(pairs) + if not self.fix_crop: + w_offset = random.randint(0, image_w - crop_pair[0]) + h_offset = random.randint(0, image_h - crop_pair[1]) + else: + w_step = (image_w - crop_pair[0]) / 4 + h_step = (image_h - crop_pair[1]) / 4 + + ret = list() + ret.append((0, 0)) # upper left + if self.allow_duplication or w_step != 0: + ret.append((4 * w_step, 0)) # upper right + if self.allow_duplication or h_step != 0: + ret.append((0, 4 * h_step)) # lower left + if self.allow_duplication or (h_step != 0 and w_step != 0): + ret.append((4 * w_step, 4 * h_step)) # lower right + if self.allow_duplication or (h_step != 0 or w_step != 0): + ret.append((2 * w_step, 2 * h_step)) # center + + if self.more_fix_crop: + ret.append((0, 2 * h_step)) # center left + ret.append((4 * w_step, 2 * h_step)) # center right + ret.append((2 * w_step, 4 * h_step)) # lower center + ret.append((2 * w_step, 0 * h_step)) # upper center + + ret.append((1 * w_step, 1 * h_step)) # upper left quarter + ret.append((3 * w_step, 1 * h_step)) # upper right quarter + ret.append((1 * w_step, 3 * h_step)) # lower left quarter + ret.append((3 * w_step, 3 * h_step)) # lower righ quarter + + w_offset, h_offset = random.choice(ret) + + return crop_pair[0], crop_pair[1], w_offset, h_offset + + crop_w, crop_h, offset_w, offset_h = _sample_crop_size(im_size) + crop_img_group = [ + img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h)) + for img in imgs + ] + if self.backend == 'pillow': + ret_img_group = [ + img.resize((input_size[0], input_size[1]), Image.BILINEAR) + for img in crop_img_group + ] + else: + ret_img_group = [ + Image.fromarray( + cv2.resize(np.asarray(img), + dsize=(input_size[0], input_size[1]), + interpolation=cv2.INTER_LINEAR)) + for img in crop_img_group + ] + results['imgs'] = ret_img_group + return results + + +@PIPELINES.register() +class RandomFlip(object): + """ + Random Flip images. + Args: + p(float): Random flip images with the probability p. + """ + def __init__(self, p=0.5): + self.p = p + + def __call__(self, results): + """ + Performs random flip operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + flip_imgs: List where each item is a PIL.Image after random flip. + """ + imgs = results['imgs'] + v = random.random() + if v < self.p: + if isinstance(imgs, paddle.Tensor): + results['imgs'] = paddle.flip(imgs, axis=[3]) + elif isinstance(imgs[0], np.ndarray): + results['imgs'] = [cv2.flip(img, 1, img) for img in imgs + ] # [[h,w,c], [h,w,c], ..., [h,w,c]] + else: + results['imgs'] = [ + img.transpose(Image.FLIP_LEFT_RIGHT) for img in imgs + ] + else: + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class RandomBrightness(object): + """ + Random Brightness images. 
+ Args: + p(float): Random brightness images with the probability p. + """ + def __init__(self, p=0.1, brightness=1): + self.p = p + self.brightness = brightness + + def __call__(self, results): + """ + Performs random brightness operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + brightness_imgs: List where each item is a PIL.Image after random brightness. + """ + imgs = results['imgs'] + v = random.random() + + if v < self.p: + transform = ColorJitter(brightness=self.brightness) + results['imgs'] = [transform(img) for img in imgs] + else: + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class RandomSaturation(object): + """ + Random Saturation images. + Args: + p(float): Random saturation images with the probability p. + """ + def __init__(self, p=0.1, saturation=2): + self.p = p + self.saturation = saturation + + def __call__(self, results): + """ + Performs random saturation operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + saturation_imgs: List where each item is a PIL.Image after random saturation. + """ + imgs = results['imgs'] + v = random.random() + + if v < self.p: + transform = ColorJitter(saturation=self.saturation) + results['imgs'] = [transform(img) for img in imgs] + else: + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class RandomHue(object): + """ + Random Hue images. + Args: + p(float): Random hue images with the probability p. + """ + def __init__(self, p=0.1, hue=0.5): + self.p = p + self.hue = hue + + def __call__(self, results): + """ + Performs random hue operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + hue_imgs: List where each item is a PIL.Image after random hue. + """ + imgs = results['imgs'] + v = random.random() + + if v < self.p: + transform = ColorJitter(hue=self.hue) + results['imgs'] = [transform(img) for img in imgs] + else: + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class RandomGamma(object): + """ + Random Gamma images. + Args: + p(float): Random gamma images with the probability p. + gamma (float): Non negative real number, same as `\\gamma` in the equation. + gamma larger than 1 make the shadows darker, + while gamma smaller than 1 make dark regions lighter. + """ + def __init__(self, p=0.1, gamma=0.2): + self.p = p + self.value = [1 - gamma, 1 + gamma] + self.value[0] = max(self.value[0], 0) + + def _adust_gamma(self, img, gamma, gain=1.0): + flag = False + if isinstance(img, np.ndarray): + flag = True + img = Image.fromarray(img) + input_mode = img.mode + img = img.convert("RGB") + gamma_map = [ + int((255 + 1 - 1e-3) * gain * pow(ele / 255.0, gamma)) + for ele in range(256) + ] * 3 + img = img.point( + gamma_map) # use PIL's point-function to accelerate this part + img = img.convert(input_mode) + if flag: + img = np.array(img) + return img + + def __call__(self, results): + """ + Performs random gamma operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + gamma_imgs: List where each item is a PIL.Image after random gamma. 
+ """ + imgs = results['imgs'] + v = random.random() + + if v < self.p: + gamma = random.uniform(self.value[0], self.value[1]) + results['imgs'] = [self._adust_gamma(img, gamma) for img in imgs] + else: + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class Image2Array(object): + """ + transfer PIL.Image to Numpy array and transpose dimensions from 'dhwc' to 'dchw'. + Args: + transpose: whether to transpose or not, default True, False for slowfast. + """ + def __init__(self, transpose=True, data_format='tchw'): + assert data_format in [ + 'tchw', 'cthw' + ], f"Target format must in ['tchw', 'cthw'], but got {data_format}" + self.transpose = transpose + self.data_format = data_format + + def __call__(self, results): + """ + Performs Image to NumpyArray operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + np_imgs: Numpy array. + """ + imgs = results['imgs'] + if 'backend' in results and results[ + 'backend'] == 'pyav': # [T,H,W,C] in [0, 1] + if self.transpose: + if self.data_format == 'tchw': + t_imgs = imgs.transpose((0, 3, 1, 2)) # tchw + else: + t_imgs = imgs.transpose((3, 0, 1, 2)) # cthw + results['imgs'] = t_imgs + else: + t_imgs = np.stack(imgs).astype('float32') + if self.transpose: + if self.data_format == 'tchw': + t_imgs = t_imgs.transpose(0, 3, 1, 2) # tchw + else: + t_imgs = t_imgs.transpose(3, 0, 1, 2) # cthw + results['imgs'] = t_imgs + return results + + +@PIPELINES.register() +class Normalization(object): + """ + Normalization. + Args: + mean(Sequence[float]): mean values of different channels. + std(Sequence[float]): std values of different channels. + tensor_shape(list): size of mean, default [3,1,1]. For slowfast, [1,1,1,3] + """ + def __init__(self, mean, std, tensor_shape=[3, 1, 1], inplace=False): + if not isinstance(mean, Sequence): + raise TypeError( + f'Mean must be list, tuple or np.ndarray, but got {type(mean)}') + if not isinstance(std, Sequence): + raise TypeError( + f'Std must be list, tuple or np.ndarray, but got {type(std)}') + + self.inplace = inplace + if not inplace: + self.mean = np.array(mean).reshape(tensor_shape).astype(np.float32) + self.std = np.array(std).reshape(tensor_shape).astype(np.float32) + else: + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + + def __call__(self, results): + """ + Performs normalization operations. + Args: + imgs: Numpy array. + return: + np_imgs: Numpy array after normalization. + """ + if self.inplace: + n = len(results['imgs']) + h, w, c = results['imgs'][0].shape + norm_imgs = np.empty((n, h, w, c), dtype=np.float32) + for i, img in enumerate(results['imgs']): + norm_imgs[i] = img + + for img in norm_imgs: # [n,h,w,c] + mean = np.float64(self.mean.reshape(1, -1)) # [1, 3] + stdinv = 1 / np.float64(self.std.reshape(1, -1)) # [1, 3] + cv2.subtract(img, mean, img) + cv2.multiply(img, stdinv, img) + else: + imgs = results['imgs'] + norm_imgs = imgs / 255.0 + norm_imgs -= self.mean + norm_imgs /= self.std + if 'backend' in results and results['backend'] == 'pyav': + norm_imgs = paddle.to_tensor(norm_imgs, dtype=paddle.float32) + results['imgs'] = norm_imgs + return results + + +@PIPELINES.register() +class JitterScale(object): + """ + Scale image, while the target short size is randomly select between min_size and max_size. + Args: + min_size: Lower bound for random sampler. + max_size: Higher bound for random sampler. 
+ """ + def __init__(self, + min_size, + max_size, + short_cycle_factors=[0.5, 0.7071], + default_min_size=256): + self.default_min_size = default_min_size + self.orig_min_size = self.min_size = min_size + self.max_size = max_size + self.short_cycle_factors = short_cycle_factors + + def __call__(self, results): + """ + Performs jitter resize operations. + Args: + imgs (Sequence[PIL.Image]): List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + resized_imgs: List where each item is a PIL.Image after scaling. + """ + short_cycle_idx = results.get('short_cycle_idx') + if short_cycle_idx in [0, 1]: + self.min_size = int( + round(self.short_cycle_factors[short_cycle_idx] * + self.default_min_size)) + else: + self.min_size = self.orig_min_size + + imgs = results['imgs'] + size = int(round(np.random.uniform(self.min_size, self.max_size))) + assert (len(imgs) >= 1), \ + "len(imgs):{} should be larger than 1".format(len(imgs)) + + if 'backend' in results and results['backend'] == 'pyav': + height, width = imgs.shape[2:] + else: + width, height = imgs[0].size + if (width <= height and width == size) or (height <= width + and height == size): + return results + + new_width = size + new_height = size + if width < height: + new_height = int(math.floor((float(height) / width) * size)) + else: + new_width = int(math.floor((float(width) / height) * size)) + + if 'backend' in results and results['backend'] == 'pyav': + frames_resize = F.interpolate(imgs, + size=(new_height, new_width), + mode="bilinear", + align_corners=False) # [c,t,h,w] + else: + frames_resize = [] + for j in range(len(imgs)): + img = imgs[j] + scale_img = img.resize((new_width, new_height), Image.BILINEAR) + frames_resize.append(scale_img) + + results['imgs'] = frames_resize + return results + + +@PIPELINES.register() +class MultiCenterCrop(object): + """ + center crop, left center crop right center crop + Args: + target_size(int): Random crop a square with the target_size from an image. + """ + def __init__(self, target_size): + self.target_size = target_size + + def __call__(self, results): + """ + Performs random crop operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + crop_imgs: List where each item is a PIL.Image after random crop. 
+ """ + imgs = results['imgs'] + if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w] + h, w = imgs.shape[2:] + else: + w, h = imgs[0].size + th, tw = self.target_size, self.target_size + + assert (w >= self.target_size) and (h >= self.target_size), \ + "image width({}) and height({}) should be larger than crop size".format( + w, h, self.target_size) + + crop_images = [] + #just for tensor + crop_imgs_center = [] + crop_imgs_left = [] + crop_imgs_right = [] + if 'backend' in results and results['backend'] == 'pyav': + #center_corp + x1 = 0 + if w > self.target_size: + x1 = int((w - self.target_size) / 2.0) + y1 = 0 + if h > self.target_size: + y1 = int((h - self.target_size) / 2.0) + crop_imgs_center = imgs[:, :, y1:y1 + th, + x1:x1 + tw].numpy() # [C, T, th, tw] + #left_crop + x1 = 0 + y1 = 0 + if h > self.target_size: + y1 = int((h - self.target_size) / 2.0) + crop_imgs_left = imgs[:, :, y1:y1 + th, x1:x1 + tw].numpy() + #right_crop + x1 = 0 + y1 = 0 + if w > self.target_size: + x1 = w - self.target_size + if h > self.target_size: + y1 = int((h - self.target_size) / 2.0) + crop_imgs_right = imgs[:, :, y1:y1 + th, x1:x1 + tw].numpy() + crop_imgs = np.concatenate( + (crop_imgs_center, crop_imgs_left, crop_imgs_right), axis=1) + crop_images = paddle.to_tensor(crop_imgs) + + else: + x1 = 0 + if w > self.target_size: + x1 = random.randint(0, w - tw) + y1 = 0 + if h > self.target_size: + y1 = random.randint(0, h - th) + for img in imgs: + if w == tw and h == th: + crop_images.append(img) + else: + crop_images.append(img.crop((x1, y1, x1 + tw, y1 + th))) + results['imgs'] = crop_images + return results + + +@PIPELINES.register() +class MultiCrop(object): + """ + Random crop image. + This operation can perform multi-crop during multi-clip test, as in slowfast model. + Args: + target_size(int): Random crop a square with the target_size from an image. + """ + def __init__(self, + target_size, + default_crop_size=224, + short_cycle_factors=[0.5, 0.7071], + test_mode=False): + self.orig_target_size = self.target_size = target_size + self.short_cycle_factors = short_cycle_factors + self.default_crop_size = default_crop_size + self.test_mode = test_mode + + def __call__(self, results): + """ + Performs random crop operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + crop_imgs: List where each item is a PIL.Image after random crop. 
+ """ + imgs = results['imgs'] + spatial_sample_index = results['spatial_sample_index'] + spatial_num_clips = results['spatial_num_clips'] + + short_cycle_idx = results.get('short_cycle_idx') + if short_cycle_idx in [0, 1]: + self.target_size = int( + round(self.short_cycle_factors[short_cycle_idx] * + self.default_crop_size)) + else: + self.target_size = self.orig_target_size # use saved value before call + + w, h = imgs[0].size + if w == self.target_size and h == self.target_size: + return results + + assert (w >= self.target_size) and (h >= self.target_size), \ + "image width({}) and height({}) should be larger than crop size({},{})".format(w, h, self.target_size, self.target_size) + frames_crop = [] + if not self.test_mode: + x_offset = random.randint(0, w - self.target_size) + y_offset = random.randint(0, h - self.target_size) + else: # multi-crop + x_gap = int( + math.ceil((w - self.target_size) / (spatial_num_clips - 1))) + y_gap = int( + math.ceil((h - self.target_size) / (spatial_num_clips - 1))) + if h > w: + x_offset = int(math.ceil((w - self.target_size) / 2)) + if spatial_sample_index == 0: + y_offset = 0 + elif spatial_sample_index == spatial_num_clips - 1: + y_offset = h - self.target_size + else: + y_offset = y_gap * spatial_sample_index + else: + y_offset = int(math.ceil((h - self.target_size) / 2)) + if spatial_sample_index == 0: + x_offset = 0 + elif spatial_sample_index == spatial_num_clips - 1: + x_offset = w - self.target_size + else: + x_offset = x_gap * spatial_sample_index + + for img in imgs: + nimg = img.crop((x_offset, y_offset, x_offset + self.target_size, + y_offset + self.target_size)) + frames_crop.append(nimg) + results['imgs'] = frames_crop + return results + + +@PIPELINES.register() +class PackOutput(object): + """ + In slowfast model, we want to get slow pathway from fast pathway based on + alpha factor. + Args: + alpha(int): temporal length of fast/slow + """ + def __init__(self, alpha): + self.alpha = alpha + + def __call__(self, results): + fast_pathway = results['imgs'] + + # sample num points between start and end + slow_idx_start = 0 + slow_idx_end = fast_pathway.shape[0] - 1 + slow_idx_num = fast_pathway.shape[0] // self.alpha + slow_idxs_select = np.linspace(slow_idx_start, slow_idx_end, + slow_idx_num).astype("int64") + slow_pathway = fast_pathway[slow_idxs_select] + + # T H W C -> C T H W. 
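+ # Move both pathways to channel-first layout for the SlowFast backbone; the slow pathway keeps only 1/alpha of the frames selected evenly above, while the fast pathway keeps them all.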
+ slow_pathway = slow_pathway.transpose(3, 0, 1, 2) + fast_pathway = fast_pathway.transpose(3, 0, 1, 2) + + # slow + fast + frames_list = [slow_pathway, fast_pathway] + results['imgs'] = frames_list + return results + + +@PIPELINES.register() +class GroupFullResSample(object): + def __init__(self, crop_size, flip=False): + self.crop_size = crop_size if not isinstance(crop_size, int) else ( + crop_size, crop_size) + self.flip = flip + + def __call__(self, results): + img_group = results['imgs'] + + image_w, image_h = img_group[0].size + crop_w, crop_h = self.crop_size + + w_step = (image_w - crop_w) // 4 + h_step = (image_h - crop_h) // 4 + + offsets = list() + offsets.append((0 * w_step, 2 * h_step)) # left + offsets.append((4 * w_step, 2 * h_step)) # right + offsets.append((2 * w_step, 2 * h_step)) # center + + oversample_group = list() + for o_w, o_h in offsets: + normal_group = list() + flip_group = list() + for i, img in enumerate(img_group): + crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h)) + normal_group.append(crop) + if self.flip: + flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT) + flip_group.append(flip_crop) + + oversample_group.extend(normal_group) + if self.flip: + oversample_group.extend(flip_group) + + results['imgs'] = oversample_group + return results + + +@PIPELINES.register() +class TenCrop: + """ + Crop out 5 regions (4 corner points + 1 center point) from the picture, + and then flip the cropping result to get 10 cropped images, which can make the prediction result more robust. + Args: + target_size(int | tuple[int]): (w, h) of target size for crop. + """ + def __init__(self, target_size): + self.target_size = (target_size, target_size) + + def __call__(self, results): + imgs = results['imgs'] + img_w, img_h = imgs[0].size + crop_w, crop_h = self.target_size + w_step = (img_w - crop_w) // 4 + h_step = (img_h - crop_h) // 4 + offsets = [ + (0, 0), + (4 * w_step, 0), + (0, 4 * h_step), + (4 * w_step, 4 * h_step), + (2 * w_step, 2 * h_step), + ] + img_crops = list() + for x_offset, y_offset in offsets: + crop = [ + img.crop( + (x_offset, y_offset, x_offset + crop_w, y_offset + crop_h)) + for img in imgs + ] + crop_fliped = [ + timg.transpose(Image.FLIP_LEFT_RIGHT) for timg in crop + ] + img_crops.extend(crop) + img_crops.extend(crop_fliped) + + results['imgs'] = img_crops + return results + + +@PIPELINES.register() +class UniformCrop: + """ + Perform uniform spatial sampling on the images, + select the two ends of the long side and the middle position (left middle right or top middle bottom) 3 regions. + Args: + target_size(int | tuple[int]): (w, h) of target size for crop. 
+ """ + def __init__(self, target_size, backend='cv2'): + if isinstance(target_size, tuple): + self.target_size = target_size + elif isinstance(target_size, int): + self.target_size = (target_size, target_size) + else: + raise TypeError( + f'target_size must be int or tuple[int], but got {type(target_size)}' + ) + self.backend = backend + + def __call__(self, results): + + imgs = results['imgs'] + if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w] + img_h, img_w = imgs.shape[2:] + elif self.backend == 'pillow': + img_w, img_h = imgs[0].size + else: + img_h, img_w = imgs[0].shape[:2] + + crop_w, crop_h = self.target_size + if crop_h == img_h: + w_step = (img_w - crop_w) // 2 + offsets = [ + (0, 0), + (w_step * 2, 0), + (w_step, 0), + ] + elif crop_w == img_w: + h_step = (img_h - crop_h) // 2 + offsets = [ + (0, 0), + (0, h_step * 2), + (0, h_step), + ] + else: + raise ValueError( + f"img_w({img_w}) == crop_w({crop_w}) or img_h({img_h}) == crop_h({crop_h})" + ) + img_crops = [] + if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w] + for x_offset, y_offset in offsets: + crop = imgs[:, :, y_offset:y_offset + crop_h, + x_offset:x_offset + crop_w] + img_crops.append(crop) + img_crops = paddle.concat(img_crops, axis=1) + else: + if self.backend == 'pillow': + for x_offset, y_offset in offsets: + crop = [ + img.crop((x_offset, y_offset, x_offset + crop_w, + y_offset + crop_h)) for img in imgs + ] + img_crops.extend(crop) + else: + for x_offset, y_offset in offsets: + crop = [ + img[y_offset:y_offset + crop_h, + x_offset:x_offset + crop_w] for img in imgs + ] + img_crops.extend(crop) + results['imgs'] = img_crops + return results + + +@PIPELINES.register() +class GroupResize(object): + def __init__(self, height, width, scale, K, mode='train'): + self.height = height + self.width = width + self.scale = scale + self.resize = {} + self.K = np.array(K, dtype=np.float32) + self.mode = mode + for i in range(self.scale): + s = 2**i + self.resize[i] = paddle.vision.transforms.Resize( + (self.height // s, self.width // s), interpolation='lanczos') + + def __call__(self, results): + if self.mode == 'infer': + imgs = results['imgs'] + for k in list(imgs): # ("color", 0, -1) + if "color" in k or "color_n" in k: + n, im, _ = k + for i in range(self.scale): + imgs[(n, im, i)] = self.resize[i](imgs[(n, im, i - 1)]) + else: + imgs = results['imgs'] + for scale in range(self.scale): + K = self.K.copy() + + K[0, :] *= self.width // (2**scale) + K[1, :] *= self.height // (2**scale) + + inv_K = np.linalg.pinv(K) + imgs[("K", scale)] = K + imgs[("inv_K", scale)] = inv_K + + for k in list(imgs): + if "color" in k or "color_n" in k: + n, im, i = k + for i in range(self.scale): + imgs[(n, im, i)] = self.resize[i](imgs[(n, im, i - 1)]) + + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class ColorJitter(object): + """Randomly change the brightness, contrast, saturation and hue of an image. + """ + def __init__(self, + brightness=0, + contrast=0, + saturation=0, + hue=0, + mode='train', + p=0.5, + keys=None): + self.mode = mode + self.colorjitter = paddle.vision.transforms.ColorJitter( + brightness, contrast, saturation, hue) + self.p = p + + def __call__(self, results): + """ + Args: + results (PIL Image): Input image. + + Returns: + PIL Image: Color jittered image. 
+ """ + + do_color_aug = random.random() > self.p + imgs = results['imgs'] + for k in list(imgs): + f = imgs[k] + if "color" in k or "color_n" in k: + n, im, i = k + imgs[(n, im, i)] = f + if do_color_aug: + imgs[(n + "_aug", im, i)] = self.colorjitter(f) + else: + imgs[(n + "_aug", im, i)] = f + if self.mode == "train": + for i in results['frame_idxs']: + del imgs[("color", i, -1)] + del imgs[("color_aug", i, -1)] + del imgs[("color_n", i, -1)] + del imgs[("color_n_aug", i, -1)] + else: + for i in results['frame_idxs']: + del imgs[("color", i, -1)] + del imgs[("color_aug", i, -1)] + + results['img'] = imgs + return results + + +@PIPELINES.register() +class GroupRandomFlip(object): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, results): + + imgs = results['imgs'] + do_flip = random.random() > self.p + if do_flip: + for k in list(imgs): + if "color" in k or "color_n" in k: + n, im, i = k + imgs[(n, im, + i)] = imgs[(n, im, + i)].transpose(Image.FLIP_LEFT_RIGHT) + if "depth_gt" in imgs: + imgs['depth_gt'] = np.array(np.fliplr(imgs['depth_gt'])) + + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class ToArray(object): + def __init__(self): + pass + + def __call__(self, results): + imgs = results['imgs'] + for k in list(imgs): + if "color" in k or "color_n" in k or "color_aug" in k or "color_n_aug" in k: + n, im, i = k + imgs[(n, im, + i)] = np.array(imgs[(n, im, i)]).astype('float32') / 255.0 + imgs[(n, im, i)] = imgs[(n, im, i)].transpose((2, 0, 1)) + if "depth_gt" in imgs: + imgs['depth_gt'] = np.array(imgs['depth_gt']).astype('float32') + + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class YowoAug(object): + def __init__(self, target_size=224, jitter=0.2, hue=0.1, saturation=1.5, exposure=1.5, valid_mode=False): + self.shape = (target_size, target_size) + self.jitter = jitter + self.hue = hue + self.saturation = saturation + self.exposure = exposure + self.valid_mode = valid_mode + + def _rand_scale(self, s): + scale = random.uniform(1, s) + if (random.randint(1, 10000) % 2): + return scale + return 1. 
/ scale + + def _distort_image(self, im, hue, sat, val): + im = im.convert('HSV') + cs = list(im.split()) + cs[1] = cs[1].point(lambda i: i * sat) + cs[2] = cs[2].point(lambda i: i * val) + + def _change_hue(x): + x += hue * 255 + if x > 255: + x -= 255 + if x < 0: + x += 255 + return x + + cs[0] = cs[0].point(_change_hue) + im = Image.merge(im.mode, tuple(cs)) + + im = im.convert('RGB') + # constrain_image(im) + return im + + def _random_distort_image(self, im, dhue, dsat, dexp): + res = self._distort_image(im, dhue, dsat, dexp) + return res + + def _read_truths_args(self, lab_path, min_box_scale): + truths = np.loadtxt(lab_path) + truths = np.reshape(truths, (truths.size // 5, 5)) + new_truths = [] + for i in range(truths.shape[0]): + cx = (truths[i][1] + truths[i][3]) / (2 * 320) + cy = (truths[i][2] + truths[i][4]) / (2 * 240) + imgw = (truths[i][3] - truths[i][1]) / 320 + imgh = (truths[i][4] - truths[i][2]) / 240 + truths[i][0] = truths[i][0] - 1 + truths[i][1] = cx + truths[i][2] = cy + truths[i][3] = imgw + truths[i][4] = imgh + + if truths[i][3] < min_box_scale: + continue + new_truths.append([truths[i][0], truths[i][1], truths[i][2], truths[i][3], truths[i][4]]) + return np.array(new_truths) + + def _fill_truth_detection(self, labpath, flip, dx, dy, sx, sy): + max_boxes = 50 + label = np.zeros((max_boxes, 5)) + bs = np.loadtxt(labpath) + bs = np.reshape(bs, (-1, 5)) + + for i in range(bs.shape[0]): + cx = (bs[i][1] + bs[i][3]) / (2 * 320) + cy = (bs[i][2] + bs[i][4]) / (2 * 240) + imgw = (bs[i][3] - bs[i][1]) / 320 + imgh = (bs[i][4] - bs[i][2]) / 240 + bs[i][0] = bs[i][0] - 1 + bs[i][1] = cx + bs[i][2] = cy + bs[i][3] = imgw + bs[i][4] = imgh + + cc = 0 + for i in range(bs.shape[0]): + x1 = bs[i][1] - bs[i][3] / 2 + y1 = bs[i][2] - bs[i][4] / 2 + x2 = bs[i][1] + bs[i][3] / 2 + y2 = bs[i][2] + bs[i][4] / 2 + + x1 = min(0.999, max(0, x1 * sx - dx)) + y1 = min(0.999, max(0, y1 * sy - dy)) + x2 = min(0.999, max(0, x2 * sx - dx)) + y2 = min(0.999, max(0, y2 * sy - dy)) + + bs[i][1] = (x1 + x2) / 2 + bs[i][2] = (y1 + y2) / 2 + bs[i][3] = (x2 - x1) + bs[i][4] = (y2 - y1) + + if flip: + bs[i][1] = 0.999 - bs[i][1] + + if bs[i][3] < 0.001 or bs[i][4] < 0.001: + continue + label[cc] = bs[i] + cc += 1 + if cc >= 50: + break + + label = np.reshape(label, (-1)) + return label + + def __call__(self, results): + clip = results['imgs'] + frame_num = len(clip) + oh = clip[0].height + ow = clip[0].width + labpath = results['filename'].replace('jpg', 'txt').replace('rgb-images', 'labels') + if not self.valid_mode: + dw = int(ow * self.jitter) + dh = int(oh * self.jitter) + + pleft = random.randint(-dw, dw) + pright = random.randint(-dw, dw) + ptop = random.randint(-dh, dh) + pbot = random.randint(-dh, dh) + + swidth = ow - pleft - pright + sheight = oh - ptop - pbot + + sx = float(swidth) / ow + sy = float(sheight) / oh + + dx = (float(pleft) / ow) / sx + dy = (float(ptop) / oh) / sy + + flip = random.randint(1, 10000) % 2 + + dhue = random.uniform(-self.hue, self.hue) + dsat = self._rand_scale(self.saturation) + dexp = self._rand_scale(self.exposure) + + # Augment + cropped = [img.crop((pleft, ptop, pleft + swidth - 1, ptop + sheight - 1)) for img in clip] + + sized = [img.resize(self.shape) for img in cropped] + + if flip: + sized = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in sized] + + clip = [self._random_distort_image(img, dhue, dsat, dexp) for img in sized] + + label = self._fill_truth_detection(labpath, flip, dx, dy, 1. / sx, 1. 
/ sy) + + else: + label = np.zeros([50 * 5]) + tmp = self._read_truths_args(labpath, 8.0 / clip[0].width).astype('float32') + tmp = np.reshape(tmp, [-1]) + tsz = tmp.size + if tsz > 50 * 5: + label = tmp[0:50 * 5] + elif tsz > 0: + label[0:tsz] = tmp + clip = [img.resize(self.shape) for img in clip] + + clip = [np.asarray(img).astype('float32') / 255.0 for img in clip] + clip = np.concatenate(clip, 0).reshape([frame_num, 224, 224, 3]) + clip = np.transpose(clip, [3, 0, 1, 2]) + results['imgs'] = clip + results['labels'] = label + return results diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/augmentations_ava.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/augmentations_ava.py new file mode 100644 index 0000000..4f0c43d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/augmentations_ava.py @@ -0,0 +1,749 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import numpy as np +import math +from PIL import Image +from ..registry import PIPELINES +from collections.abc import Sequence +import cv2 + +pillow_interp_codes = { + 'nearest': Image.NEAREST, + 'bilinear': Image.BILINEAR, + 'bicubic': Image.BICUBIC, + 'box': Image.BOX, + 'lanczos': Image.LANCZOS, + 'hamming': Image.HAMMING +} + +cv2_interp_codes = { + 'nearest': cv2.INTER_NEAREST, + 'bilinear': cv2.INTER_LINEAR, + 'bicubic': cv2.INTER_CUBIC, + 'area': cv2.INTER_AREA, + 'lanczos': cv2.INTER_LANCZOS4 +} + + +def _init_lazy_if_proper(results, lazy): + """Initialize lazy operation properly. + + Make sure that a lazy operation is properly initialized, + and avoid a non-lazy operation accidentally getting mixed in. + + Required keys in results are "imgs" if "img_shape" not in results, + otherwise, Required keys in results are "img_shape", add or modified keys + are "img_shape", "lazy". + Add or modified keys in "lazy" are "original_shape", "crop_bbox", "flip", + "flip_direction", "interpolation". + + Args: + results (dict): A dict stores data pipeline result. + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + + if 'img_shape' not in results: + results['img_shape'] = results['imgs'][0].shape[:2] + if lazy: + if 'lazy' not in results: + img_h, img_w = results['img_shape'] + lazyop = dict() + lazyop['original_shape'] = results['img_shape'] + lazyop['crop_bbox'] = np.array([0, 0, img_w, img_h], + dtype=np.float32) + lazyop['flip'] = False + lazyop['flip_direction'] = None + lazyop['interpolation'] = None + results['lazy'] = lazyop + else: + assert 'lazy' not in results, 'Use Fuse after lazy operations' + + +def _scale_size(size, scale): + """Rescale a size by a ratio. + + Args: + size (tuple[int]): (w, h). + scale (float): Scaling factor. + + Returns: + tuple[int]: scaled size. + """ + w, h = size + return int(w * float(scale) + 0.5), int(h * float(scale) + 0.5) + + +def rescale_size(old_size, scale, return_scale=False): + """Calculate the new size to be rescaled to. 
+ + Args: + old_size (tuple[int]): The old size (w, h) of image. + scale (float | tuple[int]): The scaling factor or maximum size. + If it is a float number, then the image will be rescaled by this + factor, else if it is a tuple of 2 integers, then the image will + be rescaled as large as possible within the scale. + return_scale (bool): Whether to return the scaling factor besides the + rescaled image size. + + Returns: + tuple[int]: The new rescaled image size. + """ + w, h = old_size + if isinstance(scale, (float, int)): + if scale <= 0: + raise ValueError(f'Invalid scale {scale}, must be positive.') + scale_factor = scale + elif isinstance(scale, tuple): + max_long_edge = max(scale) + max_short_edge = min(scale) + scale_factor = min(max_long_edge / max(h, w), + max_short_edge / min(h, w)) + else: + raise TypeError( + f'Scale must be a number or tuple of int, but got {type(scale)}') + + new_size = _scale_size((w, h), scale_factor) + + if return_scale: + return new_size, scale_factor + else: + return new_size + + +def imresize(img, + size, + return_scale=False, + interpolation='bilinear', + out=None, + backend=None): + """Resize image to a given size. """ + h, w = img.shape[:2] + if backend is None: + backend = 'cv2' + if backend not in ['cv2', 'pillow']: + raise ValueError(f'backend: {backend} is not supported for resize.' + f"Supported backends are 'cv2', 'pillow'") + + if backend == 'pillow': + assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' + pil_image = Image.fromarray(img) + pil_image = pil_image.resize(size, pillow_interp_codes[interpolation]) + resized_img = np.array(pil_image) + else: + resized_img = cv2.resize( + img, size, dst=out, interpolation=cv2_interp_codes[interpolation]) + if not return_scale: + return resized_img + else: + w_scale = size[0] / w + h_scale = size[1] / h + return resized_img, w_scale, h_scale + + +@PIPELINES.register() +class EntityBoxRescale: + """Rescale the entity box and proposals according to the image shape. + + Required keys are "proposals", "gt_bboxes", added or modified keys are + "gt_bboxes". If original "proposals" is not None, "proposals" and + will be added or modified. + + Args: + scale_factor (np.ndarray): The scale factor used entity_box rescaling. + """ + + def __init__(self, scale_factor): + self.scale_factor = scale_factor + + def __call__(self, results): + scale_factor = np.concatenate([self.scale_factor, self.scale_factor]) + + if 'gt_bboxes' in results: + gt_bboxes = results['gt_bboxes'] + results['gt_bboxes'] = gt_bboxes * scale_factor + + if 'proposals' in results: + proposals = results['proposals'] + if proposals is not None: + assert proposals.shape[1] == 4, ( + 'proposals shape should be in ' + f'(n, 4), but got {proposals.shape}') + results['proposals'] = proposals * scale_factor + + return results + + def __repr__(self): + return f'{self.__class__.__name__}(scale_factor={self.scale_factor})' + + +@PIPELINES.register() +class EntityBoxCrop: + """Crop the entity boxes and proposals according to the cropped images. + + Required keys are "proposals", "gt_bboxes", added or modified keys are + "gt_bboxes". If original "proposals" is not None, "proposals" will be + modified. + + Args: + crop_bbox(np.ndarray | None): The bbox used to crop the original image. 
+ """ + + def __init__(self, crop_bbox): + self.crop_bbox = crop_bbox + + def __call__(self, results): + proposals = results['proposals'] + gt_bboxes = results['gt_bboxes'] + + if self.crop_bbox is None: + return results + + x1, y1, x2, y2 = self.crop_bbox + img_w, img_h = x2 - x1, y2 - y1 + + assert gt_bboxes.shape[-1] == 4 + gt_bboxes_ = gt_bboxes.copy() + gt_bboxes_[..., 0::2] = np.clip(gt_bboxes[..., 0::2] - x1, 0, img_w - 1) + gt_bboxes_[..., 1::2] = np.clip(gt_bboxes[..., 1::2] - y1, 0, img_h - 1) + results['gt_bboxes'] = gt_bboxes_ + + if proposals is not None: + assert proposals.shape[-1] == 4 + proposals_ = proposals.copy() + proposals_[..., 0::2] = np.clip(proposals[..., 0::2] - x1, 0, + img_w - 1) + proposals_[..., 1::2] = np.clip(proposals[..., 1::2] - y1, 0, + img_h - 1) + results['proposals'] = proposals_ + return results + + def __repr__(self): + return f'{self.__class__.__name__}(crop_bbox={self.crop_bbox})' + + +@PIPELINES.register() +class EntityBoxFlip: + """Flip the entity boxes and proposals with a probability. + + Reverse the order of elements in the given bounding boxes and proposals + with a specific direction. The shape of them are preserved, but the + elements are reordered. Only the horizontal flip is supported (seems + vertical flipping makes no sense). Required keys are "proposals", + "gt_bboxes", added or modified keys are "gt_bboxes". If "proposals" + is not None, it will also be modified. + + Args: + img_shape (tuple[int]): The img shape. + """ + + def __init__(self, img_shape): + self.img_shape = img_shape + + def __call__(self, results): + proposals = results['proposals'] + gt_bboxes = results['gt_bboxes'] + img_h, img_w = self.img_shape + + assert gt_bboxes.shape[-1] == 4 + gt_bboxes_ = gt_bboxes.copy() + gt_bboxes_[..., 0::4] = img_w - gt_bboxes[..., 2::4] - 1 + gt_bboxes_[..., 2::4] = img_w - gt_bboxes[..., 0::4] - 1 + if proposals is not None: + assert proposals.shape[-1] == 4 + proposals_ = proposals.copy() + proposals_[..., 0::4] = img_w - proposals[..., 2::4] - 1 + proposals_[..., 2::4] = img_w - proposals[..., 0::4] - 1 + else: + proposals_ = None + + results['proposals'] = proposals_ + results['gt_bboxes'] = gt_bboxes_ + + return results + + def __repr__(self): + repr_str = f'{self.__class__.__name__}(img_shape={self.img_shape})' + return repr_str + + +@PIPELINES.register() +class Resize: + """Resize images to a specific size. + + Required keys are "imgs", "img_shape", "modality", added or modified + keys are "imgs", "img_shape", "keep_ratio", "scale_factor", "lazy", + "resize_size". Required keys in "lazy" is None, added or modified key is + "interpolation". + + Args: + scale (float | Tuple[int]): If keep_ratio is True, it serves as scaling + factor or maximum size: + If it is a float number, the image will be rescaled by this + factor, else if it is a tuple of 2 integers, the image will + be rescaled as large as possible within the scale. + Otherwise, it serves as (w, h) of output size. + keep_ratio (bool): If set to True, Images will be resized without + changing the aspect ratio. Otherwise, it will resize images to a + given size. Default: True. + interpolation (str): Algorithm used for interpolation: + "nearest" | "bilinear". Default: "bilinear". + lazy (bool): Determine whether to apply lazy operation. Default: False. 
+ """ + + def __init__(self, + scale, + keep_ratio=True, + interpolation='bilinear', + lazy=False): + if isinstance(scale, str): + scale = eval(scale) + if isinstance(scale, float): + if scale <= 0: + raise ValueError(f'Invalid scale {scale}, must be positive.') + elif isinstance(scale, tuple): + max_long_edge = max(scale) + max_short_edge = min(scale) + if max_short_edge == -1: + # assign np.inf to long edge for rescaling short edge later. + scale = (np.inf, max_long_edge) + else: + raise TypeError( + f'Scale must be float or tuple of int, but got {type(scale)}') + self.scale = scale + self.keep_ratio = keep_ratio + self.interpolation = interpolation + self.lazy = lazy + + def __call__(self, results): + """Performs the Resize augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + + _init_lazy_if_proper(results, self.lazy) + + if 'scale_factor' not in results: + results['scale_factor'] = np.array([1, 1], dtype=np.float32) + img_h, img_w = results['img_shape'] + + if self.keep_ratio: + new_w, new_h = rescale_size((img_w, img_h), self.scale) + else: + new_w, new_h = self.scale + + self.scale_factor = np.array([new_w / img_w, new_h / img_h], + dtype=np.float32) + results['img_shape'] = (new_h, new_w) + results['keep_ratio'] = self.keep_ratio + results['scale_factor'] = results['scale_factor'] * self.scale_factor + + if not self.lazy: + if 'imgs' in results: + results['imgs'] = [ + imresize( + img, (new_w, new_h), interpolation=self.interpolation) + for img in results['imgs'] + ] + if 'keypoint' in results: + results['keypoint'] = results['keypoint'] * self.scale_factor + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Put Flip at last for now') + lazyop['interpolation'] = self.interpolation + + #if 'gt_bboxes' in results: + assert not self.lazy + entity_box_rescale = EntityBoxRescale(self.scale_factor) + results = entity_box_rescale(results) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'scale={self.scale}, keep_ratio={self.keep_ratio}, ' + f'interpolation={self.interpolation}, ' + f'lazy={self.lazy})') + return repr_str + + +@PIPELINES.register() +class RandomRescale: + """Randomly resize images so that the short_edge is resized to a specific + size in a given range. The scale ratio is unchanged after resizing. + """ + + def __init__(self, scale_range, interpolation='bilinear'): + scale_range = eval(scale_range) + self.scale_range = scale_range + + assert len(scale_range) == 2 + assert scale_range[0] < scale_range[1] + assert np.all([x > 0 for x in scale_range]) + + self.keep_ratio = True + self.interpolation = interpolation + + def __call__(self, results): + """Performs the Resize augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + short_edge = np.random.randint(self.scale_range[0], + self.scale_range[1] + 1) + resize = Resize((-1, short_edge), + keep_ratio=True, + interpolation=self.interpolation, + lazy=False) + results = resize(results) + + results['short_edge'] = short_edge + return results + + def __repr__(self): + scale_range = self.scale_range + repr_str = (f'{self.__class__.__name__}(' + f'scale_range=({scale_range[0]}, {scale_range[1]}), ' + f'interpolation={self.interpolation})') + return repr_str + + +@PIPELINES.register() +class Rescale: + """resize images so that the short_edge is resized to a specific + size in a given range. 
The scale ratio is unchanged after resizing. + + Required keys are "imgs", "img_shape", "modality", added or modified + keys are "imgs", "img_shape", "keep_ratio", "scale_factor", "resize_size", + "short_edge". + + Args: + scale_range (tuple[int]): The range of short edge length. A closed + interval. + interpolation (str): Algorithm used for interpolation: + "nearest" | "bilinear". Default: "bilinear". + """ + + def __init__(self, scale_range, interpolation='bilinear'): + scale_range = eval(scale_range) + self.scale_range = scale_range + + self.keep_ratio = True + self.interpolation = interpolation + + def __call__(self, results): + """Performs the Resize augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + resize = Resize( + self.scale_range, + keep_ratio=True, + interpolation=self.interpolation, + lazy=False) + results = resize(results) + return results + + def __repr__(self): + scale_range = self.scale_range + repr_str = (f'{self.__class__.__name__}(' + f'scale_range=({scale_range[0]}, {scale_range[1]}), ' + f'interpolation={self.interpolation})') + return repr_str + + +@PIPELINES.register() +class RandomCrop_v2: + """Vanilla square random crop that specifics the output size. + + Required keys in results are "imgs" and "img_shape", added or + modified keys are "imgs", "lazy"; Required keys in "lazy" are "flip", + "crop_bbox", added or modified key is "crop_bbox". + + Args: + size (int): The output size of the images. + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + + def __init__(self, size, lazy=False): + if not isinstance(size, int): + raise TypeError(f'Size must be an int, but got {type(size)}') + self.size = size + self.lazy = lazy + + def __call__(self, results): + """Performs the RandomCrop augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + _init_lazy_if_proper(results, self.lazy) + + img_h, img_w = results['img_shape'] + assert self.size <= img_h and self.size <= img_w + + y_offset = 0 + x_offset = 0 + if img_h > self.size: + y_offset = int(np.random.randint(0, img_h - self.size)) + if img_w > self.size: + x_offset = int(np.random.randint(0, img_w - self.size)) + if 'crop_quadruple' not in results: + results['crop_quadruple'] = np.array( + [0, 0, 1, 1], # x, y, w, h + dtype=np.float32) + + x_ratio, y_ratio = x_offset / img_w, y_offset / img_h + w_ratio, h_ratio = self.size / img_w, self.size / img_h + + old_crop_quadruple = results['crop_quadruple'] + old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1] + old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3] + new_crop_quadruple = [ + old_x_ratio + x_ratio * old_w_ratio, + old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio, + h_ratio * old_x_ratio + ] + results['crop_quadruple'] = np.array( + new_crop_quadruple, dtype=np.float32) + + new_h, new_w = self.size, self.size + + results['crop_bbox'] = np.array( + [x_offset, y_offset, x_offset + new_w, y_offset + new_h]) + results['img_shape'] = (new_h, new_w) + + if not self.lazy: + results['imgs'] = [ + img[y_offset:y_offset + new_h, x_offset:x_offset + new_w] + for img in results['imgs'] + ] + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Put Flip at last for now') + + # record crop_bbox in lazyop dict to ensure only crop once in Fuse + lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox'] + left = x_offset * (lazy_right - lazy_left) / img_w + right = (x_offset + new_w) * (lazy_right - lazy_left) / img_w + top = y_offset * (lazy_bottom - lazy_top) / img_h + bottom = (y_offset + new_h) * (lazy_bottom - lazy_top) / img_h + lazyop['crop_bbox'] = np.array( + [(lazy_left + left), (lazy_top + top), (lazy_left + right), + (lazy_top + bottom)], + dtype=np.float32) + + # Process entity boxes + if 'gt_bboxes' in results: + assert not self.lazy + entity_box_crop = EntityBoxCrop(results['crop_bbox']) + results = entity_box_crop(results) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(size={self.size}, ' + f'lazy={self.lazy})') + return repr_str + + +def imflip_(img, direction='horizontal'): + """Inplace flip an image horizontally or vertically. + + Args: + img (ndarray): Image to be flipped. + direction (str): The flip direction, either "horizontal" or + "vertical" or "diagonal". + + Returns: + ndarray: The flipped image (inplace). + """ + assert direction in ['horizontal', 'vertical', 'diagonal'] + if direction == 'horizontal': + return cv2.flip(img, 1, img) + elif direction == 'vertical': + return cv2.flip(img, 0, img) + else: + return cv2.flip(img, -1, img) + + +def iminvert(img): + """Invert (negate) an image. + + Args: + img (ndarray): Image to be inverted. + + Returns: + ndarray: The inverted image. + """ + return np.full_like(img, 255) - img + + +@PIPELINES.register() +class Flip: + """Flip the input images with a probability. + + Reverse the order of elements in the given imgs with a specific direction. + The shape of the imgs is preserved, but the elements are reordered. + Required keys are "imgs", "img_shape", "modality", added or modified + keys are "imgs", "lazy" and "flip_direction". Required keys in "lazy" is + None, added or modified key are "flip" and "flip_direction". 
The Flip + augmentation should be placed after any cropping / reshaping augmentations, + to make sure crop_quadruple is calculated properly. + + Args: + flip_ratio (float): Probability of implementing flip. Default: 0.5. + direction (str): Flip imgs horizontally or vertically. Options are + "horizontal" | "vertical". Default: "horizontal". + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + _directions = ['horizontal', 'vertical'] + + def __init__(self, flip_ratio=0.5, direction='horizontal', lazy=False): + if direction not in self._directions: + raise ValueError(f'Direction {direction} is not supported. ' + f'Currently support ones are {self._directions}') + self.flip_ratio = flip_ratio + self.direction = direction + self.lazy = lazy + + def __call__(self, results): + """Performs the Flip augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + _init_lazy_if_proper(results, self.lazy) + flip = np.random.rand() < self.flip_ratio + + results['flip'] = flip + results['flip_direction'] = self.direction + + if not self.lazy: + if flip: + for i, img in enumerate(results['imgs']): + imflip_(img, self.direction) + lt = len(results['imgs']) + else: + results['imgs'] = list(results['imgs']) + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Use one Flip please') + lazyop['flip'] = flip + lazyop['flip_direction'] = self.direction + + if 'gt_bboxes' in results and flip: + assert not self.lazy and self.direction == 'horizontal' + entity_box_flip = EntityBoxFlip(results['img_shape']) + results = entity_box_flip(results) + + return results + + def __repr__(self): + repr_str = ( + f'{self.__class__.__name__}(' + f'flip_ratio={self.flip_ratio}, direction={self.direction}, ' + f'lazy={self.lazy})') + return repr_str + + +def imnormalize_(img, mean, std, to_rgb=True): + """Inplace normalize an image with mean and std. + + Args: + img (ndarray): Image to be normalized. + mean (ndarray): The mean to be used for normalize. + std (ndarray): The std to be used for normalize. + to_rgb (bool): Whether to convert to rgb. + + Returns: + ndarray: The normalized image. + """ + # cv2 inplace normalization does not accept uint8 + assert img.dtype != np.uint8 + mean = np.float64(mean.reshape(1, -1)) + stdinv = 1 / np.float64(std.reshape(1, -1)) + if to_rgb: + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace + cv2.subtract(img, mean, img) # inplace + cv2.multiply(img, stdinv, img) # inplace + return img + + +@PIPELINES.register() +class Normalize: + """Normalize images with the given mean and std value. + + Required keys are "imgs", "img_shape", "modality", added or modified + keys are "imgs" and "img_norm_cfg". If modality is 'Flow', additional + keys "scale_factor" is required + + Args: + mean (Sequence[float]): Mean values of different channels. + std (Sequence[float]): Std values of different channels. + to_bgr (bool): Whether to convert channels from RGB to BGR. + Default: False. + adjust_magnitude (bool): Indicate whether to adjust the flow magnitude + on 'scale_factor' when modality is 'Flow'. Default: False. 
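        Example (editor's sketch; the mean/std values below are the common
        ImageNet statistics, used here only for illustration):
            >>> import numpy as np
            >>> norm = Normalize(mean=[123.675, 116.28, 103.53],
            ...                  std=[58.395, 57.12, 57.375])
            >>> out = norm({'imgs': [np.full((8, 8, 3), 128, dtype=np.uint8)]})
            >>> out['imgs'].shape   # frames stacked into a single float32 array
            (1, 8, 8, 3)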
+ """ + + def __init__(self, mean, std, to_bgr=False, adjust_magnitude=False): + if not isinstance(mean, Sequence): + raise TypeError( + f'Mean must be list, tuple or np.ndarray, but got {type(mean)}') + + if not isinstance(std, Sequence): + raise TypeError( + f'Std must be list, tuple or np.ndarray, but got {type(std)}') + + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + self.to_bgr = to_bgr + self.adjust_magnitude = adjust_magnitude + + def __call__(self, results): + n = len(results['imgs']) + h, w, c = results['imgs'][0].shape + imgs = np.empty((n, h, w, c), dtype=np.float32) + for i, img in enumerate(results['imgs']): + imgs[i] = img + + for img in imgs: + imnormalize_(img, self.mean, self.std, self.to_bgr) + + results['imgs'] = imgs + results['img_norm_cfg'] = dict( + mean=self.mean, std=self.std, to_bgr=self.to_bgr) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'mean={self.mean}, ' + f'std={self.std}, ' + f'to_bgr={self.to_bgr}, ' + f'adjust_magnitude={self.adjust_magnitude})') + return repr_str diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/compose.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/compose.py new file mode 100644 index 0000000..76eb4ed --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/compose.py @@ -0,0 +1,76 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Sequence +from ..registry import PIPELINES +import traceback +from ...utils import build +from ...utils import get_logger + + +@PIPELINES.register() +class Compose(object): + """ + Composes several pipelines(include decode func, sample func, and transforms) together. + + Note: To deal with ```list``` type cfg temporaray, like: + + transform: + - Crop: # A list + attribute: 10 + - Resize: # A list + attribute: 20 + + every key of list will pass as the key name to build a module. + XXX: will be improved in the future. + + Args: + pipelines (list): List of transforms to compose. + Returns: + A compose object which is callable, __call__ for this Compose + object will call each given :attr:`transforms` sequencely. + """ + def __init__(self, pipelines): + #assert isinstance(pipelines, Sequence) + self.pipelines = [] + for p in pipelines.values(): + if isinstance(p, dict): + p = build(p, PIPELINES) + self.pipelines.append(p) + elif isinstance(p, list): + for t in p: + #XXX: to deal with old format cfg, ugly code here! 
+ temp_dict = dict(name=list(t.keys())[0]) + for all_sub_t in t.values(): + if all_sub_t is not None: + temp_dict.update(all_sub_t) + + t = build(temp_dict, PIPELINES) + self.pipelines.append(t) + elif callable(p): + self.pipelines.append(p) + else: + raise TypeError(f'pipelines must be callable or a dict,' + f'but got {type(p)}') + def __call__(self, data): + for p in self.pipelines: + try: + data = p(data) + except Exception as e: + stack_info = traceback.format_exc() + logger = get_logger("paddlevideo") + logger.info("fail to perform transform [{}] with error: " + "{} and stack:\n{}".format(p, e, str(stack_info))) + raise e + return data diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode.py new file mode 100644 index 0000000..2611272 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode.py @@ -0,0 +1,348 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +try: + import av +except ImportError as e: + print( + f"Warning! {e}, [av] package and it's dependencies is required for TimeSformer and other models." + ) +import cv2 +import pickle +import decord as de +import math +import random +from ..registry import PIPELINES + + +def get_start_end_idx(video_size, clip_size, clip_idx, num_clips): + delta = max(video_size - clip_size, 0) + if clip_idx == -1: # here + # Random temporal sampling. + start_idx = random.uniform(0, delta) + else: # ignore + # Uniformly sample the clip with the given index. + start_idx = delta * clip_idx / num_clips + end_idx = start_idx + clip_size - 1 + return start_idx, end_idx + + +@PIPELINES.register() +class VideoDecoder(object): + """ + Decode mp4 file to frames. + Args: + filepath: the file path of mp4 file + """ + def __init__(self, + backend='cv2', + mode='train', + sampling_rate=32, + num_seg=8, + num_clips=1, + target_fps=30): + + self.backend = backend + # params below only for TimeSformer + self.mode = mode + self.sampling_rate = sampling_rate + self.num_seg = num_seg + self.num_clips = num_clips + self.target_fps = target_fps + + def __call__(self, results): + """ + Perform mp4 decode operations. + return: + List where each item is a numpy array after decoder. 
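        Example (editor's sketch; 'example.mp4' is a placeholder path):
            >>> decoder = VideoDecoder(backend='cv2')
            >>> out = decoder({'filename': 'example.mp4'})
            >>> out['format'], len(out['frames']) == out['frames_len']
            ('video', True)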
+ """ + file_path = results['filename'] + results['format'] = 'video' + results['backend'] = self.backend + + if self.backend == 'cv2': + cap = cv2.VideoCapture(file_path) + videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + sampledFrames = [] + for i in range(videolen): + ret, frame = cap.read() + # maybe first frame is empty + if ret == False: + continue + img = frame[:, :, ::-1] + sampledFrames.append(img) + results['frames'] = sampledFrames + results['frames_len'] = len(sampledFrames) + + elif self.backend == 'decord': + container = de.VideoReader(file_path) + frames_len = len(container) + results['frames'] = container + results['frames_len'] = frames_len + + elif self.backend == 'pyav': # for TimeSformer + if self.mode in ["train", "valid"]: + clip_idx = -1 + elif self.mode in ["test"]: + clip_idx = 0 + else: + raise NotImplementedError + + container = av.open(file_path) + + num_clips = 1 # always be 1 + + # decode process + fps = float(container.streams.video[0].average_rate) + + frames_length = container.streams.video[0].frames + duration = container.streams.video[0].duration + + if duration is None: + # If failed to fetch the decoding information, decode the entire video. + decode_all_video = True + video_start_pts, video_end_pts = 0, math.inf + else: + decode_all_video = False + start_idx, end_idx = get_start_end_idx( + frames_length, + self.sampling_rate * self.num_seg / self.target_fps * fps, + clip_idx, num_clips) + timebase = duration / frames_length + video_start_pts = int(start_idx * timebase) + video_end_pts = int(end_idx * timebase) + + frames = None + # If video stream was found, fetch video frames from the video. + if container.streams.video: + margin = 1024 + seek_offset = max(video_start_pts - margin, 0) + + container.seek(seek_offset, + any_frame=False, + backward=True, + stream=container.streams.video[0]) + tmp_frames = {} + buffer_count = 0 + max_pts = 0 + for frame in container.decode(**{"video": 0}): + max_pts = max(max_pts, frame.pts) + if frame.pts < video_start_pts: + continue + if frame.pts <= video_end_pts: + tmp_frames[frame.pts] = frame + else: + buffer_count += 1 + tmp_frames[frame.pts] = frame + if buffer_count >= 0: + break + video_frames = [tmp_frames[pts] for pts in sorted(tmp_frames)] + + container.close() + + frames = [frame.to_rgb().to_ndarray() for frame in video_frames] + clip_sz = self.sampling_rate * self.num_seg / self.target_fps * fps + + start_idx, end_idx = get_start_end_idx( + len(frames), # frame_len + clip_sz, + clip_idx if decode_all_video else + 0, # If decode all video, -1 in train and valid, 0 in test; + # else, always 0 in train, valid and test, as we has selected clip size frames when decode. 
+ 1) + results['frames'] = frames + results['frames_len'] = len(frames) + results['start_idx'] = start_idx + results['end_idx'] = end_idx + else: + raise NotImplementedError + # pass + return results + + +@PIPELINES.register() +class FrameDecoder(object): + """just parse results + """ + def __init__(self): + pass + + def __call__(self, results): + results['format'] = 'frame' + return results + + +@PIPELINES.register() +class MRIDecoder(object): + """just parse results + """ + def __init__(self): + pass + + def __call__(self, results): + results['format'] = 'MRI' + return results + + +@PIPELINES.register() +class FeatureDecoder(object): + """ + Perform feature decode operations.e.g.youtube8m + """ + def __init__(self, num_classes, max_len=512, has_label=True): + self.max_len = max_len + self.num_classes = num_classes + self.has_label = has_label + + def __call__(self, results): + """ + Perform feature decode operations. + return: + List where each item is a numpy array after decoder. + """ + #1. load pkl + #2. parse to rgb/audio/ + #3. padding + + filepath = results['filename'] + data = pickle.load(open(filepath, 'rb'), encoding='bytes') + + record = data + nframes = record['nframes'] if 'nframes' in record else record[ + b'nframes'] + rgb = record['feature'].astype( + float) if 'feature' in record else record[b'feature'].astype(float) + audio = record['audio'].astype( + float) if 'audio' in record else record[b'audio'].astype(float) + if self.has_label: + label = record['label'] if 'label' in record else record[b'label'] + one_hot_label = self.make_one_hot(label, self.num_classes) + + rgb = rgb[0:nframes, :] + audio = audio[0:nframes, :] + + rgb = self.dequantize(rgb, + max_quantized_value=2., + min_quantized_value=-2.) + audio = self.dequantize(audio, + max_quantized_value=2, + min_quantized_value=-2) + + if self.has_label: + results['labels'] = one_hot_label.astype("float32") + + feat_pad_list = [] + feat_len_list = [] + mask_list = [] + vitem = [rgb, audio] + for vi in range(2): #rgb and audio + if vi == 0: + prefix = "rgb_" + else: + prefix = "audio_" + feat = vitem[vi] + results[prefix + 'len'] = feat.shape[0] + #feat pad step 1. padding + feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]), + dtype=np.float32) + feat_pad = np.concatenate((feat, feat_add), axis=0) + results[prefix + 'data'] = feat_pad.astype("float32") + #feat pad step 2. 
mask + feat_mask_origin = np.ones(feat.shape, dtype=np.float32) + feat_mask_add = feat_add + feat_mask = np.concatenate((feat_mask_origin, feat_mask_add), + axis=0) + results[prefix + 'mask'] = feat_mask.astype("float32") + + return results + + def dequantize(self, + feat_vector, + max_quantized_value=2., + min_quantized_value=-2.): + """ + Dequantize the feature from the byte format to the float format + """ + + assert max_quantized_value > min_quantized_value + quantized_range = max_quantized_value - min_quantized_value + scalar = quantized_range / 255.0 + bias = (quantized_range / 512.0) + min_quantized_value + + return feat_vector * scalar + bias + + def make_one_hot(self, label, dim=3862): + one_hot_label = np.zeros(dim) + one_hot_label = one_hot_label.astype(float) + for ind in label: + one_hot_label[int(ind)] = 1 + return one_hot_label + + +@PIPELINES.register() +class ActionFeatureDecoder(object): + """ + Perform feature decode operations on footballaction + """ + def __init__(self, num_classes, max_len=512, has_label=True): + self.max_len = max_len + self.num_classes = num_classes + self.has_label = has_label + + def __call__(self, results): + """ + Perform feature decode operations. + return: + List where each item is a numpy array after decoder. + """ + #1. load pkl + #2. parse to rgb/audio/ + #3. padding + + filepath = results['filename'] + data = pickle.load(open(filepath, 'rb'), encoding='bytes') + + pkl_data = data + rgb = pkl_data['image_feature'].astype(float) + audio = pkl_data['audio_feature'].astype(float) + label_id_info = pkl_data['label_info'] + label_cls = [label_id_info['label']] + label_one = int(label_cls[0]) + if len(label_cls) > 1: + label_index = random.randint(0, 1) + label_one = int(label_cls[label_index]) + iou_norm = float(label_id_info['norm_iou']) + results['labels'] = np.array([label_one]) + results['iou_norm'] = float(iou_norm) + + vitem = [rgb, audio] + for vi in range(2): #rgb and audio + if vi == 0: + prefix = "rgb_" + else: + prefix = "audio_" + feat = vitem[vi] + results[prefix + 'len'] = feat.shape[0] + #feat pad step 1. padding + feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]), + dtype=np.float32) + feat_pad = np.concatenate((feat, feat_add), axis=0) + results[prefix + 'data'] = feat_pad.astype("float32") + #feat pad step 2. mask + feat_mask_origin = np.ones(feat.shape, dtype=np.float32) + feat_mask = np.concatenate((feat_mask_origin, feat_add), axis=0) + results[prefix + 'mask'] = feat_mask.astype("float32") + + return results diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode_image.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode_image.py new file mode 100644 index 0000000..64a7e2f --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode_image.py @@ -0,0 +1,206 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
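Editor's aside on the FeatureDecoder.dequantize helper added above in decode.py:
it maps uint8-quantized YouTube-8M style features back to floats. A minimal
standalone sketch of the same arithmetic, using the default range [-2, 2]:

    import numpy as np

    def dequantize(feat, max_q=2., min_q=-2.):
        # same formula as FeatureDecoder.dequantize above
        scalar = (max_q - min_q) / 255.0          # 4/255 per quantization step
        bias = (max_q - min_q) / 512.0 + min_q    # half-step offset: -1.9921875
        return feat * scalar + bias

    print(dequantize(np.array([0., 128., 255.])))  # approx. [-1.992, 0.016, 2.008]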
+ +import os + +import numpy as np +import PIL.Image as pil + +try: + import skimage.transform +except ImportError as e: + print( + f"Warning! {e}, [scikit-image] package and it's dependencies is required for ADDS." + ) +from PIL import Image + +from ..registry import PIPELINES + + +@PIPELINES.register() +class ImageDecoder(object): + """Decode Image + """ + def __init__(self, + dataset, + frame_idxs, + num_scales, + side_map, + full_res_shape, + img_ext, + backend='cv2'): + self.backend = backend + self.dataset = dataset + self.frame_idxs = frame_idxs + self.num_scales = num_scales + self.side_map = side_map + self.full_res_shape = full_res_shape + self.img_ext = img_ext + + def _pil_loader(self, path): + with open(path, 'rb') as f: + with Image.open(f) as img: + return img.convert('RGB') + + def get_color(self, folder, frame_index, side): + color = self._pil_loader( + self.get_image_path(self.dataset, folder, frame_index, side)) + return color + + def get_image_path(self, dataset, folder, frame_index, side): + if dataset == "kitti": + f_str = "{:010d}{}".format(frame_index, self.img_ext) + image_path = os.path.join(self.data_path, folder, f_str) + elif dataset == "kitti_odom": + f_str = "{:06d}{}".format(frame_index, self.img_ext) + image_path = os.path.join(self.data_path, + "sequences/{:02d}".format(int(folder)), + "image_{}".format(self.side_map[side]), + f_str) + elif dataset == "kitti_depth": + f_str = "{:010d}{}".format(frame_index, self.img_ext) + image_path = os.path.join( + self.data_path, folder, + "image_0{}/data".format(self.side_map[side]), f_str) + + return image_path + + def get_depth(self, dataset, folder, frame_index, side): + if dataset == "kitii_depth": + f_str = "{:010d}.png".format(frame_index) + depth_path = os.path.join( + self.data_path, folder, + "proj_depth/groundtruth/image_0{}".format(self.side_map[side]), + f_str) + + depth_gt = pil.open(depth_path) + depth_gt = depth_gt.resize(self.full_res_shape, pil.NEAREST) + depth_gt = np.array(depth_gt).astype(np.float32) / 256 + + else: + f_str = "{:010d}{}".format(frame_index, self.img_ext) + depth_path = os.path.join(self.data_path, folder + '_gt', f_str) + + img_file = Image.open(depth_path) + depth_png = np.array(img_file, dtype=int) + img_file.close() + # make sure we have a proper 16bit depth map here.. not 8bit! + assert np.max(depth_png) > 255, \ + "np.max(depth_png)={}, path={}".format(np.max(depth_png), depth_path) + + depth_gt = depth_png.astype(np.float) / 256. + + depth_gt = depth_gt[160:960 - 160, :] + + depth_gt = skimage.transform.resize(depth_gt, + self.full_res_shape[::-1], + order=0, + preserve_range=True, + mode='constant') + + return depth_gt + + def __call__(self, results): + """ + Perform mp4 decode operations. + return: + List where each item is a numpy array after decoder. 
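        Example (editor's note): for dataset='kitti' with img_ext='.png',
        frame_index=96 resolves via get_image_path to
        '<data_path>/<folder>/0000000096.png' (the index is zero-padded to
        10 digits).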
+ """ + if results.get('mode', None) == 'infer': + imgs = {} + imgs[("color", 0, + -1)] = Image.open(results["filename"]).convert("RGB") + results['imgs'] = imgs + return results + + self.data_path = results['data_path'] + results['backend'] = self.backend + + imgs = {} + + results['frame_idxs'] = self.frame_idxs + results['num_scales'] = self.num_scales + + file_name = results['filename'] + folder = results['folder'] + frame_index = results['frame_index'] + line = file_name.split('/') + istrain = folder.split('_')[1] + if 'mode' not in results: + results['mode'] = istrain + results['day_or_night'] = folder.split('_')[0] + + if istrain == "train": + if folder[0] == 'd': + folder2 = folder + '_fake_night' + flag = 0 + else: + folder2 = folder + '_fake_day' + tmp = folder + folder = folder2 + folder2 = tmp + flag = 1 + + if len(line) == 3: + side = line[2] + else: + side = None + + results['side'] = side + + for i in self.frame_idxs: + + if i == "s": + other_side = {"r": "l", "l": "r"}[side] + imgs[("color", i, + -1)] = self.get_color(folder, frame_index, other_side) + imgs[("color_n", i, + -1)] = self.get_color(folder2, frame_index, + other_side) + else: + imgs[("color", i, + -1)] = self.get_color(folder, frame_index + i, side) + imgs[("color_n", i, + -1)] = self.get_color(folder2, frame_index + i, side) + + istrain = folder.split('_')[1] + if istrain != 'train': + if flag: + depth_gt = self.get_depth(folder2, frame_index, side) + else: + depth_gt = self.get_depth(folder, frame_index, side) + imgs["depth_gt"] = np.expand_dims(depth_gt, 0) + elif istrain == 'val': + if len(line) == 3: + side = line[2] + else: + side = None + + for i in self.frame_idxs: + if i == "s": + other_side = {"r": "l", "l": "r"}[side] + imgs[("color", i, + -1)] = self.get_color(folder, frame_index, other_side) + else: + + imgs[("color", i, + -1)] = self.get_color(folder, frame_index + i, side) + + # adjusting intrinsics to match each scale in the pyramid + + depth_gt = self.get_depth(self.dataset, folder, frame_index, side) + imgs["depth_gt"] = np.expand_dims(depth_gt, 0) + results['imgs'] = imgs + + return results diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode_sampler.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode_sampler.py new file mode 100644 index 0000000..12a8f76 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode_sampler.py @@ -0,0 +1,93 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import numpy as np +from PIL import Image +import decord as de +from ..registry import PIPELINES + + +@PIPELINES.register() +class DecodeSampler(object): + """ + We use 'decord' for decode and sampling, which is faster than opencv. + This is used in slowfast model. + Args: + num_frames(int): the number of frames we want to sample. + sampling_rate(int): sampling rate for video data. 
+ target_fps(int): desired fps, default 30 + test_mode(bool): whether test or train/valid. In slowfast, we use multicrop when test. + """ + def __init__(self, + num_frames, + sampling_rate, + default_sampling_rate=2, + target_fps=30, + test_mode=False): + self.num_frames = num_frames + self.orig_sampling_rate = self.sampling_rate = sampling_rate + self.default_sampling_rate = default_sampling_rate + self.target_fps = target_fps + self.test_mode = test_mode + + def get_start_end_idx(self, video_size, clip_size, clip_idx, + temporal_num_clips): + delta = max(video_size - clip_size, 0) + if not self.test_mode: + # Random temporal sampling. + start_idx = random.uniform(0, delta) + else: + # Uniformly sample the clip with the given index. + start_idx = delta * clip_idx / temporal_num_clips + end_idx = start_idx + clip_size - 1 + return start_idx, end_idx + + def __call__(self, results): + """ + Perform mp4 decode operations. + return: + List where each item is a numpy array after decoder. + """ + short_cycle_idx = results.get('short_cycle_idx') + if short_cycle_idx: + self.sampling_rate = random.randint(self.default_sampling_rate, + self.orig_sampling_rate) + + filepath = results['filename'] + temporal_sample_index = results['temporal_sample_index'] + temporal_num_clips = results['temporal_num_clips'] + + vr = de.VideoReader(filepath) + videolen = len(vr) + + # fps = vr.get_avg_fps() + clip_size = self.num_frames * self.sampling_rate * fps / self.target_fps + + start_idx, end_idx = self.get_start_end_idx(videolen, clip_size, + temporal_sample_index, + temporal_num_clips) + index = np.linspace(start_idx, end_idx, self.num_frames).astype("int64") + index = np.clip(index, 0, videolen) + + frames_select = vr.get_batch(index) #1 for buffer + + # dearray_to_img + np_frames = frames_select.asnumpy() + frames_select_list = [] + for i in range(np_frames.shape[0]): + imgbuf = np_frames[i] + frames_select_list.append(Image.fromarray(imgbuf, mode='RGB')) + results['imgs'] = frames_select_list + return results diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode_sampler_MRI.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode_sampler_MRI.py new file mode 100644 index 0000000..08d1dd0 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode_sampler_MRI.py @@ -0,0 +1,224 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import random + +import numpy as np +from PIL import Image +try: + import SimpleITK as sitk +except ImportError as e: + print( + f"Warning! {e}, [SimpleITK] package and it's dependencies is required for PP-Care." + ) +import cv2 + +from ..registry import PIPELINES + + +@PIPELINES.register() +class SFMRI_DecodeSampler(object): + """ + Sample frames id. + NOTE: Use PIL to read image here, has diff with CV2 + Args: + num_seg(int): number of segments. + seg_len(int): number of sampled frames in each segment. + valid_mode(bool): True or False. 
+ select_left: Whether to select the frame to the left in the middle when the sampling interval is even in the test mode. + Returns: + frames_idx: the index of sampled #frames. + """ + def __init__(self, + num_seg, + seg_len, + valid_mode=False, + select_left=False, + dense_sample=False, + linspace_sample=False): + self.num_seg = num_seg + self.seg_len = seg_len + self.valid_mode = valid_mode + self.select_left = select_left + self.dense_sample = dense_sample + self.linspace_sample = linspace_sample + + def _get(self, frames_idx_s, frames_idx_f, results): + + frame_dir = results['frame_dir'] + imgs_s = [] + imgs_f = [] + MRI = sitk.GetArrayFromImage(sitk.ReadImage(frame_dir)) + for idx in frames_idx_s: + item = MRI[idx] + item = cv2.resize(item, (224, 224)) + imgs_s.append(item) + + for idx in frames_idx_f: + item = MRI[idx] + item = cv2.resize(item, (224, 224)) + imgs_f.append(item) + + results['imgs'] = [imgs_s, imgs_f] + return results + + def __call__(self, results): + """ + Args: + frames_len: length of frames. + return: + sampling id. + """ + frames_len = int(results['frames_len']) + average_dur1 = int(frames_len / self.num_seg[0]) + average_dur2 = int(frames_len / self.num_seg[1]) + frames_idx_s = [] + frames_idx_f = [] + if self.linspace_sample: + if 'start_idx' in results and 'end_idx' in results: + offsets_s = np.linspace(results['start_idx'], + results['end_idx'], self.num_seg[0]) + offsets_f = np.linspace(results['start_idx'], + results['end_idx'], self.num_seg[1]) + else: + offsets_s = np.linspace(0, frames_len - 1, self.num_seg[0]) + offsets_f = np.linspace(0, frames_len - 1, self.num_seg[1]) + offsets_s = np.clip(offsets_s, 0, frames_len - 1).astype(np.int64) + offsets_f = np.clip(offsets_f, 0, frames_len - 1).astype(np.int64) + + frames_idx_s = list(offsets_s) + frames_idx_f = list(offsets_f) + + return self._get(frames_idx_s, frames_idx_f, results) + + if not self.select_left: + if self.dense_sample: # For ppTSM + if not self.valid_mode: # train + sample_pos = max(1, 1 + frames_len - 64) + t_stride1 = 64 // self.num_seg[0] + t_stride2 = 64 // self.num_seg[1] + start_idx = 0 if sample_pos == 1 else np.random.randint( + 0, sample_pos - 1) + offsets_s = [(idx * t_stride1 + start_idx) % frames_len + 1 + for idx in range(self.num_seg[0])] + offsets_f = [(idx * t_stride2 + start_idx) % frames_len + 1 + for idx in range(self.num_seg[1])] + frames_idx_s = offsets_s + frames_idx_f = offsets_f + else: + sample_pos = max(1, 1 + frames_len - 64) + t_stride1 = 64 // self.num_seg[0] + t_stride2 = 64 // self.num_seg[1] + start_list = np.linspace(0, + sample_pos - 1, + num=10, + dtype=int) + offsets_s = [] + offsets_f = [] + for start_idx in start_list.tolist(): + offsets_s += [ + (idx * t_stride1 + start_idx) % frames_len + 1 + for idx in range(self.num_seg[0]) + ] + for start_idx in start_list.tolist(): + offsets_f += [ + (idx * t_stride2 + start_idx) % frames_len + 1 + for idx in range(self.num_seg[1]) + ] + frames_idx_s = offsets_s + frames_idx_f = offsets_f + else: + for i in range(self.num_seg[0]): + idx = 0 + if not self.valid_mode: + if average_dur1 >= self.seg_len: + idx = random.randint(0, average_dur1 - self.seg_len) + idx += i * average_dur1 + elif average_dur1 >= 1: + idx += i * average_dur1 + else: + idx = i + else: + if average_dur1 >= self.seg_len: + idx = (average_dur1 - 1) // 2 + idx += i * average_dur1 + elif average_dur1 >= 1: + idx += i * average_dur1 + else: + idx = i + for jj in range(idx, idx + self.seg_len): + frames_idx_s.append(jj) + + for i in 
range(self.num_seg[1]): + idx = 0 + if not self.valid_mode: + if average_dur2 >= self.seg_len: + idx = random.randint(0, average_dur2 - self.seg_len) + idx += i * average_dur2 + elif average_dur2 >= 1: + idx += i * average_dur2 + else: + idx = i + else: + if average_dur2 >= self.seg_len: + idx = (average_dur2 - 1) // 2 + idx += i * average_dur2 + elif average_dur2 >= 1: + idx += i * average_dur2 + else: + idx = i + for jj in range(idx, idx + self.seg_len): + frames_idx_f.append(jj) + + return self._get(frames_idx_s, frames_idx_f, results) + + else: # for TSM + if not self.valid_mode: + if average_dur2 > 0: + offsets_s = np.multiply(list(range( + self.num_seg[0])), average_dur1) + np.random.randint( + average_dur1, size=self.num_seg[0]) + + offsets_f = np.multiply(list(range( + self.num_seg[1])), average_dur2) + np.random.randint( + average_dur2, size=self.num_seg[1]) + elif frames_len > self.num_seg[1]: + offsets_s = np.sort( + np.random.randint(frames_len, size=self.num_seg[0])) + offsets_f = np.sort( + np.random.randint(frames_len, size=self.num_seg[1])) + else: + offsets_s = np.zeros(shape=(self.num_seg[0], )) + offsets_f = np.zeros(shape=(self.num_seg[1], )) + else: + if frames_len > self.num_seg[1]: + average_dur_float_s = frames_len / self.num_seg[0] + offsets_s = np.array([ + int(average_dur_float_s / 2.0 + average_dur_float_s * x) + for x in range(self.num_seg[0]) + ]) + average_dur_float_f = frames_len / self.num_seg[1] + offsets_f = np.array([ + int(average_dur_float_f / 2.0 + average_dur_float_f * x) + for x in range(self.num_seg[1]) + ]) + else: + offsets_s = np.zeros(shape=(self.num_seg[0], )) + offsets_f = np.zeros(shape=(self.num_seg[1], )) + + frames_idx_s = list(offsets_s) + frames_idx_f = list(offsets_f) + + return self._get(frames_idx_s, frames_idx_f, results) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/mix.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/mix.py new file mode 100644 index 0000000..ccc5f98 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/mix.py @@ -0,0 +1,116 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ..registry import PIPELINES + + +@PIPELINES.register() +class Mixup(object): + """ + Mixup operator. + Args: + alpha(float): alpha value. + """ + def __init__(self, alpha=0.2): + assert alpha > 0., \ + 'parameter alpha[%f] should > 0.0' % (alpha) + self.alpha = alpha + + def __call__(self, batch): + imgs, labels = list(zip(*batch)) + imgs = np.array(imgs) + labels = np.array(labels) + bs = len(batch) + idx = np.random.permutation(bs) + lam = np.random.beta(self.alpha, self.alpha) + lams = np.array([lam] * bs, dtype=np.float32) + imgs = lam * imgs + (1 - lam) * imgs[idx] + return list(zip(imgs, labels, labels[idx], lams)) + + +@PIPELINES.register() +class Cutmix(object): + """ Cutmix operator + Args: + alpha(float): alpha value. 
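    Example (editor's sketch; note that rand_bbox uses np.int, so this runs as
    written only with NumPy < 1.24):
        >>> import numpy as np
        >>> batch = [(np.random.rand(3, 32, 32).astype('float32'), 1),
        ...          (np.random.rand(3, 32, 32).astype('float32'), 0)]
        >>> mixed = Cutmix(alpha=1.0)(batch)
        >>> len(mixed), len(mixed[0])   # each item is (img, label_a, label_b, lam)
        (2, 4)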
+ """ + def __init__(self, alpha=0.2): + assert alpha > 0., \ + 'parameter alpha[%f] should > 0.0' % (alpha) + self.alpha = alpha + + def rand_bbox(self, size, lam): + """ rand_bbox """ + w = size[2] + h = size[3] + cut_rat = np.sqrt(1. - lam) + cut_w = np.int(w * cut_rat) + cut_h = np.int(h * cut_rat) + + # uniform + cx = np.random.randint(w) + cy = np.random.randint(h) + + bbx1 = np.clip(cx - cut_w // 2, 0, w) + bby1 = np.clip(cy - cut_h // 2, 0, h) + bbx2 = np.clip(cx + cut_w // 2, 0, w) + bby2 = np.clip(cy + cut_h // 2, 0, h) + + return bbx1, bby1, bbx2, bby2 + + def __call__(self, batch): + imgs, labels = list(zip(*batch)) + imgs = np.array(imgs) + labels = np.array(labels) + + bs = len(batch) + idx = np.random.permutation(bs) + lam = np.random.beta(self.alpha, self.alpha) + + bbx1, bby1, bbx2, bby2 = self.rand_bbox(imgs.shape, lam) + imgs[:, :, bbx1:bbx2, bby1:bby2] = imgs[idx, :, bbx1:bbx2, bby1:bby2] + lam = 1 - (float(bbx2 - bbx1) * (bby2 - bby1) / + (imgs.shape[-2] * imgs.shape[-1])) + lams = np.array([lam] * bs, dtype=np.float32) + + return list(zip(imgs, labels, labels[idx], lams)) + + +@PIPELINES.register() +class VideoMix(object): + """ + VideoMix operator. + Args: + cutmix_prob(float): prob choose cutmix + mixup_alpha(float): alpha for mixup aug + cutmix_alpha(float): alpha for cutmix aug + """ + def __init__(self, cutmix_prob=0.5, mixup_alpha=0.2, cutmix_alpha=1.0): + assert cutmix_prob > 0., \ + 'parameter cutmix_prob[%f] should > 0.0' % (cutmix_prob) + assert mixup_alpha > 0., \ + 'parameter mixup_alpha[%f] should > 0.0' % (mixup_alpha) + assert cutmix_alpha > 0., \ + 'parameter cutmix_alpha[%f] should > 0.0' % (cutmix_alpha) + self.cutmix_prob = cutmix_prob + self.mixup = Mixup(mixup_alpha) + self.cutmix = Cutmix(cutmix_alpha) + + def __call__(self, batch): + if np.random.random() < self.cutmix_prob: + return self.cutmix(batch) + else: + return self.mixup(batch) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/multimodal.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/multimodal.py new file mode 100644 index 0000000..39ced5d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/multimodal.py @@ -0,0 +1,380 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import numpy as np +from PIL import Image +# import decord as de +import copy +import json +from ..registry import PIPELINES + +try: + from paddlenlp.transformers import BertTokenizer +except ImportError as e: + print( + f"Warning! {e}, [paddlenlp] package and it's dependencies is required for ActBERT." + ) + + +@PIPELINES.register() +class FeaturePadding(object): + """ + Padding feature to target shape. + """ + def __init__(self, max_region_num=36, max_action_num=5): + self.max_region_num = max_region_num + self.max_action_num = max_action_num + + def __call__(self, results): + """ + Padding feature. 
+ """ + pack_feature = results['feature'] + tokenizer = results['tokenizer'] + image_feature_wp, image_target_wp, image_location_wp, \ + num_boxes, image_h, image_w, image_id, caption, \ + action_feature_wp, action_target_wp, num_actions = pack_feature + + image_feature = np.zeros((self.max_region_num, 2048), dtype=np.float32) + image_target = np.zeros((self.max_region_num, 1601), dtype=np.float32) + image_location = np.zeros((self.max_region_num, 5), dtype=np.float32) + + action_feature = np.zeros((self.max_action_num, 2048), dtype=np.float32) + action_target = np.zeros((self.max_action_num, ), dtype=np.int64) + + num_boxes = int(num_boxes) + image_feature[:num_boxes] = image_feature_wp + image_target[:num_boxes] = image_target_wp + image_location[:num_boxes, :4] = image_location_wp + + image_location[:, 4] = (image_location[:, 3] - image_location[:, 1]) * ( + image_location[:, 2] - image_location[:, 0]) / (float(image_w) * + float(image_h)) + + image_location[:, 0] = image_location[:, 0] / float(image_w) + image_location[:, 1] = image_location[:, 1] / float(image_h) + image_location[:, 2] = image_location[:, 2] / float(image_w) + image_location[:, 3] = image_location[:, 3] / float(image_h) + + image_feature = copy.deepcopy(image_feature) + image_target = copy.deepcopy(image_target) + + num_actions = int(num_actions) + action_feature[:num_actions] = action_feature_wp + action_target[:num_actions] = action_target_wp + action_feature = copy.deepcopy(action_feature) + action_target = copy.deepcopy(action_target) + + results = dict(image_feat=image_feature, + image_target=image_target, + caption=caption, + image_loc=image_location, + num_boxes=int(num_boxes), + action_feat=action_feature, + action_target=action_target, + num_actions=int(num_actions), + tokenizer=tokenizer) + return results + + +@PIPELINES.register() +class RandomCap(object): + def __init__(self, caption_path): + """ + Random Caption for NSP task + """ + self.caption_path = caption_path + + def select_caption(self, caption): + captions = caption.split('!') + rind = random.randint(0, len(captions) - 1) + caption = captions[rind] + return caption + + def get_random_caption(self, all_captions): + num_caps = len(all_captions) + rand_doc_idx = random.randint(0, num_caps - 1) + caption = all_captions[rand_doc_idx] + caption = self.select_caption(caption) + return caption + + def random_cap(self, caption, all_captions): + if random.random() > 0.5: + label = 0 + else: + caption = self.get_random_caption(all_captions) + label = 1 + return caption, label + + def __call__(self, results): + caption = results['caption'] + all_captions = list(json.load(open(self.caption_path, 'r'))) + caption = self.select_caption(caption) + caption, label = self.random_cap(caption, all_captions) + results['caption'] = caption + results['is_next'] = label + return results + + +@PIPELINES.register() +class Tokenize(object): + def __init__(self, ): + """ + Tokenize caption + """ + pass + + def __call__(self, results): + caption = results['caption'] + tokenizer = results['tokenizer'] + tokens_caption = tokenizer.tokenize(caption) + results['caption'] = tokens_caption + return results + + +@PIPELINES.register() +class RandomMask(object): + def __init__(self, + max_seq_length=36, + max_action_length=5, + max_region_length=36): + self.max_seq_length = max_seq_length + self.max_action_length = max_action_length + self.max_region_length = max_region_length + + def get_image_global_feature(self, image_feat, image_loc, image_mask): + g_image_feat = np.sum(image_feat, 
axis=0) / np.sum( + image_mask, axis=0, keepdims=True) + image_feat = np.concatenate( + [np.expand_dims(g_image_feat, axis=0), image_feat], + axis=0).astype("float32") + + g_image_loc = np.array([0, 0, 1, 1, 1]).astype("float32") + image_loc = np.concatenate( + [np.expand_dims(g_image_loc, axis=0), image_loc], axis=0) + + g_image_mask = np.array([1]) + image_mask = np.concatenate([g_image_mask, image_mask], axis=0) + + return image_feat, image_loc, image_mask + + def _truncate_seq_pair(self, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length. + This is a simple heuristic which will always truncate the longer sequence + one token at a time. This makes more sense than truncating an equal percent + of tokens from each, since if one sequence is very short then each token + that's truncated likely contains more information than a longer sequence. + """ + while True: + total_length = len(tokens_b) + if total_length <= max_length: + break + tokens_b.pop() + + def random_word(self, tokens, tokenizer): + """ + Masking some random tokens for Language Model task with probabilities as in the original BERT paper. + Args: + tokens: list of str, tokenized sentence. + tokenizer: Tokenizer, object used for tokenization (we need it's vocab here) + Return: + (list of str, list of int), masked tokens and related labels for LM prediction + """ + output_label = [] + + for i, token in enumerate(tokens): + prob = random.random() + # mask token with 15% probability + + if prob < 0.15: + prob /= 0.15 + + # 80% randomly change token to mask token + if prob < 0.8: + tokens[i] = "[MASK]" + + # 10% randomly change token to random token + elif prob < 0.9: + #tok = random.choice(list(tokenizer.vocab.items()))[0] + tok = tokenizer.vocab.idx_to_token[random.randint( + 0, + tokenizer.vocab_size, + )] + tokens[i] = tok + + # rest 10% randomly keep current token + # append current token to output (we will predict these later) + try: + output_label.append(tokenizer.vocab[token]) + except KeyError: + # For unknown words (should not occur with BPE vocab) + output_label.append(tokenizer.vocab["[UNK]"]) + print( + "Cannot find token '{}' in vocab. Using [UNK] insetad". 
+ format(token)) + else: + # no masking token (will be ignored by loss function later) + output_label.append(-1) + + return tokens, output_label + + def random_region(self, image_feat, image_loc, num_boxes): + output_label = [] + + for i in range(num_boxes): + prob = random.random() + # mask token with 15% probability + if prob < 0.15: + prob /= 0.15 + + # 80% randomly change token to mask token + if prob < 0.9: + image_feat[i] = 0 + + # rest 20% randomly keep current token + # append current token to output (we will predict these later) + output_label.append(1) + else: + # no masking token (will be ignored by loss function later) + output_label.append(-1) + + return image_feat, image_loc, output_label + + def random_action(self, action_feat, action_target, num_actions): + output_label = [] + + for i in range(num_actions): + prob = random.random() + # mask token with 15% probability + if prob < 0.15: + prob /= 0.15 + + # 90% randomly change token to mask token + if prob < 0.9: + action_feat[i] = 0 + + # rest 10% randomly keep current token + # append current token to output (we will predict these later) + output_label.append(action_target[i]) + else: + # no masking token (will be ignored by loss function later) + output_label.append(-1) + + return action_feat, output_label + + def __call__(self, results): + caption = results['caption'] + tokenizer = results['tokenizer'] + image_feat = results['image_feat'] + image_loc = results['image_loc'] + num_boxes = results['num_boxes'] + action_feat = results['action_feat'] + action_target = results['action_target'] + num_actions = results['num_actions'] + is_next = results['is_next'] + image_target = results['image_target'] + + self._truncate_seq_pair(caption, self.max_seq_length - 2) + caption, caption_label = self.random_word(caption, tokenizer) + + image_feat, image_loc, image_label = self.random_region( + image_feat, image_loc, num_boxes) + action_feat, action_label = self.random_action(action_feat, + action_target, + num_actions) + + # concatenate lm labels and account for CLS, SEP, SEP + lm_label_ids = [-1] + caption_label + [-1] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambigiously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + + tokens = [] + segment_ids = [] + + tokens.append("[CLS]") + segment_ids.append(0) + + for token in caption: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to. + input_mask = [1] * (len(input_ids)) + image_mask = [1] * (num_boxes) + action_mask = [1] * (num_actions) + + # Zero-pad up to the visual sequence length. 
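        # Editor's note: the loops below pad image_mask/image_label up to
        # max_region_length and action_mask/action_label up to
        # max_action_length; padded positions carry mask 0 and label -1, so
        # they are ignored by the masked-prediction losses.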
+ while len(image_mask) < self.max_region_length: + image_mask.append(0) + image_label.append(-1) + while len(action_mask) < self.max_action_length: + action_mask.append(0) + action_label.append(-1) + + # Zero-pad up to the sequence length. + while len(input_ids) < self.max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + lm_label_ids.append(-1) + + assert len(input_ids) == self.max_seq_length + assert len(input_mask) == self.max_seq_length + assert len(segment_ids) == self.max_seq_length + assert len(lm_label_ids) == self.max_seq_length + assert len(image_mask) == self.max_region_length + assert len(image_label) == self.max_region_length + assert len(action_mask) == self.max_action_length + assert len(action_label) == self.max_action_length + + image_feat, image_loc, image_mask = self.get_image_global_feature( + image_feat, image_loc, np.array(image_mask)) + features = [ + np.array(input_ids), + action_feat, + image_feat, + image_loc, + np.array(segment_ids), + np.array(input_mask), + image_mask, + np.array(action_mask), + np.array(lm_label_ids), + np.array(action_label), + np.array(is_next), + np.array(image_label), + image_target, + ] + results['features'] = features + return results diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/sample.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/sample.py new file mode 100644 index 0000000..0a1d068 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/sample.py @@ -0,0 +1,382 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import random + +import numpy as np +from PIL import Image +try: + import SimpleITK as sitk +except ImportError as e: + print( + f"Warning! {e}, [SimpleITK] package and it's dependencies is required for PP-Care." + ) +import cv2 + +from ..registry import PIPELINES + +try: + import cPickle as pickle + from cStringIO import StringIO +except ImportError: + import pickle + from io import BytesIO + + +@PIPELINES.register() +class Sampler(object): + """ + Sample frames id. + NOTE: Use PIL to read image here, has diff with CV2 + Args: + num_seg(int): number of segments. + seg_len(int): number of sampled frames in each segment. + valid_mode(bool): True or False. + select_left: Whether to select the frame to the left in the middle when the sampling interval is even in the test mode. + Returns: + frames_idx: the index of sampled #frames. 
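        Example (editor's illustration of the default uniform sampling): with
        num_seg=8, seg_len=1, valid_mode=True and a 100-frame video, each of
        the 8 equal segments contributes its centre frame, i.e. 0-based
        indices 5, 17, 29, 41, 53, 65, 77, 89 (shifted by +1 for the 'frame'
        format).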
+ """ + def __init__(self, + num_seg, + seg_len, + frame_interval=None, + valid_mode=False, + select_left=False, + dense_sample=False, + linspace_sample=False, + use_pil=True): + self.num_seg = num_seg + self.seg_len = seg_len + self.frame_interval = frame_interval + self.valid_mode = valid_mode + self.select_left = select_left + self.dense_sample = dense_sample + self.linspace_sample = linspace_sample + self.use_pil = use_pil + + def _get(self, frames_idx, results): + data_format = results['format'] + + if data_format == "frame": + frame_dir = results['frame_dir'] + imgs = [] + for idx in frames_idx: + img = Image.open( + os.path.join(frame_dir, + results['suffix'].format(idx))).convert('RGB') + imgs.append(img) + + elif data_format == "MRI": + frame_dir = results['frame_dir'] + imgs = [] + MRI = sitk.GetArrayFromImage(sitk.ReadImage(frame_dir)) + for idx in frames_idx: + item = MRI[idx] + item = cv2.resize(item, (224, 224)) + imgs.append(item) + + elif data_format == "video": + if results['backend'] == 'cv2': + frames = np.array(results['frames']) + imgs = [] + for idx in frames_idx: + imgbuf = frames[idx] + img = Image.fromarray(imgbuf, mode='RGB') + imgs.append(img) + elif results['backend'] == 'decord': + container = results['frames'] + if self.use_pil: + frames_select = container.get_batch(frames_idx) + # dearray_to_img + np_frames = frames_select.asnumpy() + imgs = [] + for i in range(np_frames.shape[0]): + imgbuf = np_frames[i] + imgs.append(Image.fromarray(imgbuf, mode='RGB')) + else: + if frames_idx.ndim != 1: + frames_idx = np.squeeze(frames_idx) + frame_dict = { + idx: container[idx].asnumpy() + for idx in np.unique(frames_idx) + } + imgs = [frame_dict[idx] for idx in frames_idx] + elif results['backend'] == 'pyav': + imgs = [] + frames = np.array(results['frames']) + for idx in frames_idx: + if self.dense_sample: + idx = idx - 1 + imgbuf = frames[idx] + imgs.append(imgbuf) + imgs = np.stack(imgs) # thwc + else: + raise NotImplementedError + else: + raise NotImplementedError + results['imgs'] = imgs + return results + + def _get_train_clips(self, num_frames): + ori_seg_len = self.seg_len * self.frame_interval + avg_interval = (num_frames - ori_seg_len + 1) // self.num_seg + + if avg_interval > 0: + base_offsets = np.arange(self.num_seg) * avg_interval + clip_offsets = base_offsets + np.random.randint(avg_interval, + size=self.num_seg) + elif num_frames > max(self.num_seg, ori_seg_len): + clip_offsets = np.sort( + np.random.randint(num_frames - ori_seg_len + 1, + size=self.num_seg)) + elif avg_interval == 0: + ratio = (num_frames - ori_seg_len + 1.0) / self.num_seg + clip_offsets = np.around(np.arange(self.num_seg) * ratio) + else: + clip_offsets = np.zeros((self.num_seg, ), dtype=np.int) + return clip_offsets + + def _get_test_clips(self, num_frames): + ori_seg_len = self.seg_len * self.frame_interval + avg_interval = (num_frames - ori_seg_len + 1) / float(self.num_seg) + if num_frames > ori_seg_len - 1: + base_offsets = np.arange(self.num_seg) * avg_interval + clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int) + else: + clip_offsets = np.zeros((self.num_seg, ), dtype=np.int) + return clip_offsets + + def __call__(self, results): + """ + Args: + frames_len: length of frames. + return: + sampling id. 
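+
+        Note (added for clarity): the branches below are mutually exclusive
+        and are tried in this order: frame_interval-based sampling,
+        linspace_sample, dense_sample / uniform segment sampling
+        (select_left=False), then the TSM-style branch (select_left=True);
+        each branch returns through self._get().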
+ """ + frames_len = int(results['frames_len']) + frames_idx = [] + if self.frame_interval is not None: + assert isinstance(self.frame_interval, int) + if not self.valid_mode: + offsets = self._get_train_clips(frames_len) + else: + offsets = self._get_test_clips(frames_len) + + offsets = offsets[:, None] + np.arange( + self.seg_len)[None, :] * self.frame_interval + offsets = np.concatenate(offsets) + + offsets = offsets.reshape((-1, self.seg_len)) + offsets = np.mod(offsets, frames_len) + offsets = np.concatenate(offsets) + + if results['format'] == 'video': + frames_idx = offsets + elif results['format'] == 'frame': + frames_idx = list(offsets + 1) + else: + raise NotImplementedError + + return self._get(frames_idx, results) + + if self.linspace_sample: + if 'start_idx' in results and 'end_idx' in results: + offsets = np.linspace(results['start_idx'], results['end_idx'], + self.num_seg) + else: + offsets = np.linspace(0, frames_len - 1, self.num_seg) + offsets = np.clip(offsets, 0, frames_len - 1).astype(np.int64) + if results['format'] == 'video': + frames_idx = list(offsets) + frames_idx = [x % frames_len for x in frames_idx] + elif results['format'] == 'frame': + frames_idx = list(offsets + 1) + + elif results['format'] == 'MRI': + frames_idx = list(offsets) + + else: + raise NotImplementedError + return self._get(frames_idx, results) + + average_dur = int(frames_len / self.num_seg) + if not self.select_left: + if self.dense_sample: # For ppTSM + if not self.valid_mode: # train + sample_pos = max(1, 1 + frames_len - 64) + t_stride = 64 // self.num_seg + start_idx = 0 if sample_pos == 1 else np.random.randint( + 0, sample_pos - 1) + offsets = [(idx * t_stride + start_idx) % frames_len + 1 + for idx in range(self.num_seg)] + frames_idx = offsets + else: + sample_pos = max(1, 1 + frames_len - 64) + t_stride = 64 // self.num_seg + start_list = np.linspace(0, + sample_pos - 1, + num=10, + dtype=int) + offsets = [] + for start_idx in start_list.tolist(): + offsets += [ + (idx * t_stride + start_idx) % frames_len + 1 + for idx in range(self.num_seg) + ] + frames_idx = offsets + else: + for i in range(self.num_seg): + idx = 0 + if not self.valid_mode: + if average_dur >= self.seg_len: + idx = random.randint(0, average_dur - self.seg_len) + idx += i * average_dur + elif average_dur >= 1: + idx += i * average_dur + else: + idx = i + else: + if average_dur >= self.seg_len: + idx = (average_dur - 1) // 2 + idx += i * average_dur + elif average_dur >= 1: + idx += i * average_dur + else: + idx = i + for jj in range(idx, idx + self.seg_len): + if results['format'] == 'video': + frames_idx.append(int(jj % frames_len)) + elif results['format'] == 'frame': + frames_idx.append(jj + 1) + + elif results['format'] == 'MRI': + frames_idx.append(jj) + else: + raise NotImplementedError + return self._get(frames_idx, results) + + else: # for TSM + if not self.valid_mode: + if average_dur > 0: + offsets = np.multiply(list(range(self.num_seg)), + average_dur) + np.random.randint( + average_dur, size=self.num_seg) + elif frames_len > self.num_seg: + offsets = np.sort( + np.random.randint(frames_len, size=self.num_seg)) + else: + offsets = np.zeros(shape=(self.num_seg, )) + else: + if frames_len > self.num_seg: + average_dur_float = frames_len / self.num_seg + offsets = np.array([ + int(average_dur_float / 2.0 + average_dur_float * x) + for x in range(self.num_seg) + ]) + else: + offsets = np.zeros(shape=(self.num_seg, )) + + if results['format'] == 'video': + frames_idx = list(offsets) + frames_idx = [x % 
frames_len for x in frames_idx] + elif results['format'] == 'frame': + frames_idx = list(offsets + 1) + + elif results['format'] == 'MRI': + frames_idx = list(offsets) + + else: + raise NotImplementedError + + return self._get(frames_idx, results) + + +@PIPELINES.register() +class SamplerPkl(object): + """ + Sample frames id. + NOTE: Use PIL to read image here, has diff with CV2 + Args: + num_seg(int): number of segments. + seg_len(int): number of sampled frames in each segment. + mode(str): 'train', 'valid' + Returns: + frames_idx: the index of sampled #frames. + """ + def __init__(self, num_seg, seg_len, backend='pillow', valid_mode=False): + self.num_seg = num_seg + self.seg_len = seg_len + self.valid_mode = valid_mode + self.backend = backend + + def _get(self, buf): + if isinstance(buf, str): + img = Image.open(StringIO(buf)) + else: + img = Image.open(BytesIO(buf)) + img = img.convert('RGB') + if self.backend != 'pillow': + img = np.array(img) + return img + + def __call__(self, results): + """ + Args: + frames_len: length of frames. + return: + sampling id. + """ + filename = results['frame_dir'] + data_loaded = pickle.load(open(filename, 'rb'), encoding='bytes') + video_name, label, frames = data_loaded + if isinstance(label, dict): + label = label['动作类型'] + results['labels'] = label + elif len(label) == 1: + results['labels'] = int(label[0]) + else: + results['labels'] = int(label[0]) if random.random() < 0.5 else int( + label[1]) + results['frames_len'] = len(frames) + frames_len = results['frames_len'] + average_dur = int(int(frames_len) / self.num_seg) + imgs = [] + for i in range(self.num_seg): + idx = 0 + if not self.valid_mode: + if average_dur >= self.seg_len: + idx = random.randint(0, average_dur - self.seg_len) + idx += i * average_dur + elif average_dur >= 1: + idx += i * average_dur + else: + idx = i + else: + if average_dur >= self.seg_len: + idx = (average_dur - 1) // 2 + idx += i * average_dur + elif average_dur >= 1: + idx += i * average_dur + else: + idx = i + + for jj in range(idx, idx + self.seg_len): + imgbuf = frames[int(jj % results['frames_len'])] + img = self._get(imgbuf) + imgs.append(img) + results['backend'] = self.backend + results['imgs'] = imgs + + return results diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/sample_ava.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/sample_ava.py new file mode 100644 index 0000000..39e90a2 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/sample_ava.py @@ -0,0 +1,375 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
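+
+# Overview (added note): this file provides SampleFrames (generic clip-offset
+# sampling), a minimal FileClient / HardDiskBackend pair for reading raw
+# frame files from disk, RawFrameDecode (decode the sampled frames and
+# rescale gt_bboxes / proposals to pixel coordinates) and SampleAVAFrames
+# (sample a clip of indices centred on an AVA keyframe timestamp).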
+import random +from PIL import Image +from ..registry import PIPELINES +import os +import numpy as np +import io +import os.path as osp +from abc import ABCMeta, abstractmethod +import cv2 +from cv2 import IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_UNCHANGED +import inspect + +imread_backend = 'cv2' +imread_flags = { + 'color': IMREAD_COLOR, + 'grayscale': IMREAD_GRAYSCALE, + 'unchanged': IMREAD_UNCHANGED +} + + +@PIPELINES.register() +class SampleFrames: + """Sample frames from the video. """ + + def __init__(self, + clip_len, + frame_interval=1, + num_clips=1, + temporal_jitter=False, + twice_sample=False, + out_of_bound_opt='loop', + test_mode=False): + self.clip_len = clip_len + self.frame_interval = frame_interval + self.num_clips = num_clips + self.temporal_jitter = temporal_jitter + self.twice_sample = twice_sample + self.out_of_bound_opt = out_of_bound_opt + self.test_mode = test_mode + assert self.out_of_bound_opt in ['loop', 'repeat_last'] + + def _get_train_clips(self, num_frames): + """Get clip offsets in train mode. """ + ori_clip_len = self.clip_len * self.frame_interval + avg_interval = (num_frames - ori_clip_len + 1) // self.num_clips + if avg_interval > 0: + base_offsets = np.arange(self.num_clips) * avg_interval + clip_offsets = base_offsets + np.random.randint( + avg_interval, size=self.num_clips) + elif num_frames > max(self.num_clips, ori_clip_len): + clip_offsets = np.sort( + np.random.randint( + num_frames - ori_clip_len + 1, size=self.num_clips)) + elif avg_interval == 0: + ratio = (num_frames - ori_clip_len + 1.0) / self.num_clips + clip_offsets = np.around(np.arange(self.num_clips) * ratio) + else: + clip_offsets = np.zeros((self.num_clips, ), dtype=np.int) + return clip_offsets + + def _get_test_clips(self, num_frames): + """Get clip offsets in test mode. """ + ori_clip_len = self.clip_len * self.frame_interval + avg_interval = (num_frames - ori_clip_len + 1) / float(self.num_clips) + if num_frames > ori_clip_len - 1: + base_offsets = np.arange(self.num_clips) * avg_interval + clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int) + if self.twice_sample: + clip_offsets = np.concatenate([clip_offsets, base_offsets]) + else: + clip_offsets = np.zeros((self.num_clips, ), dtype=np.int) + return clip_offsets + + def _sample_clips(self, num_frames): + """Choose clip offsets for the video in a given mode. """ + if self.test_mode: + clip_offsets = self._get_test_clips(num_frames) + else: + clip_offsets = self._get_train_clips(num_frames) + return clip_offsets + + def __call__(self, results): + """Perform the SampleFrames loading. 
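+
+        Illustrative example (assumed numbers): with clip_len=8,
+        frame_interval=2, num_clips=1 and total_frames=64 in training mode,
+        a single offset is drawn from [0, 49) and the returned frame_inds are
+        offset, offset + 2, ..., offset + 14, shifted by
+        results['start_index'].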
""" + total_frames = results['total_frames'] + clip_offsets = self._sample_clips(total_frames) + frame_inds = clip_offsets[:, None] + np.arange( + self.clip_len)[None, :] * self.frame_interval + frame_inds = np.concatenate(frame_inds) + if self.temporal_jitter: + perframe_offsets = np.random.randint( + self.frame_interval, size=len(frame_inds)) + frame_inds += perframe_offsets + frame_inds = frame_inds.reshape((-1, self.clip_len)) + if self.out_of_bound_opt == 'loop': + frame_inds = np.mod(frame_inds, total_frames) + elif self.out_of_bound_opt == 'repeat_last': + safe_inds = frame_inds < total_frames + unsafe_inds = 1 - safe_inds + last_ind = np.max(safe_inds * frame_inds, axis=1) + new_inds = (safe_inds * frame_inds + (unsafe_inds.T * last_ind).T) + frame_inds = new_inds + else: + raise ValueError('Illegal out_of_bound option.') + start_index = results['start_index'] + frame_inds = np.concatenate(frame_inds) + start_index + results['frame_inds'] = frame_inds.astype(np.int) + results['clip_len'] = self.clip_len + results['frame_interval'] = self.frame_interval + results['num_clips'] = self.num_clips + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'frame_interval={self.frame_interval}, ' + f'num_clips={self.num_clips}, ' + f'temporal_jitter={self.temporal_jitter}, ' + f'twice_sample={self.twice_sample}, ' + f'out_of_bound_opt={self.out_of_bound_opt}, ' + f'test_mode={self.test_mode})') + return repr_str + +class BaseStorageBackend(metaclass=ABCMeta): + """Abstract class of storage backends. """ + + @abstractmethod + def get(self, filepath): + pass + + @abstractmethod + def get_text(self, filepath): + pass + +class HardDiskBackend(BaseStorageBackend): + """Raw hard disks storage backend.""" + + def get(self, filepath): + filepath = str(filepath) + with open(filepath, 'rb') as f: + value_buf = f.read() + return value_buf + + def get_text(self, filepath): + filepath = str(filepath) + with open(filepath, 'r') as f: + value_buf = f.read() + return value_buf + +class FileClient: + """A general file client to access files in different backend. """ + + _backends = { + 'disk': HardDiskBackend, + } + + def __init__(self, backend='disk', **kwargs): + if backend not in self._backends: + raise ValueError( + f'Backend {backend} is not supported. Currently supported ones' + f' are {list(self._backends.keys())}') + self.backend = backend + self.client = self._backends[backend](**kwargs) + + @classmethod + def _register_backend(cls, name, backend, force=False): + if not isinstance(name, str): + raise TypeError('the backend name should be a string, ' + f'but got {type(name)}') + if not inspect.isclass(backend): + raise TypeError( + f'backend should be a class but got {type(backend)}') + if not issubclass(backend, BaseStorageBackend): + raise TypeError( + f'backend {backend} is not a subclass of BaseStorageBackend') + if not force and name in cls._backends: + raise KeyError( + f'{name} is already registered as a storage backend, ' + 'add "force=True" if you want to override it') + + cls._backends[name] = backend + + @classmethod + def register_backend(cls, name, backend=None, force=False): + """Register a backend to FileClient. 
""" + + if backend is not None: + cls._register_backend(name, backend, force=force) + return + + def _register(backend_cls): + cls._register_backend(name, backend_cls, force=force) + return backend_cls + + return _register + + def get(self, filepath): + return self.client.get(filepath) + + def get_text(self, filepath): + return self.client.get_text(filepath) + +@PIPELINES.register() +class RawFrameDecode: + """Load and decode frames with given indices. """ + + def __init__(self, io_backend='disk', decoding_backend='cv2', **kwargs): + self.io_backend = io_backend + self.decoding_backend = decoding_backend + self.kwargs = kwargs + self.file_client = None + + def _pillow2array(self,img, flag='color', channel_order='bgr'): + """Convert a pillow image to numpy array. """ + + channel_order = channel_order.lower() + if channel_order not in ['rgb', 'bgr']: + raise ValueError('channel order must be either "rgb" or "bgr"') + + if flag == 'unchanged': + array = np.array(img) + if array.ndim >= 3 and array.shape[2] >= 3: # color image + array[:, :, :3] = array[:, :, (2, 1, 0)] # RGB to BGR + else: + # If the image mode is not 'RGB', convert it to 'RGB' first. + if img.mode != 'RGB': + if img.mode != 'LA': + # Most formats except 'LA' can be directly converted to RGB + img = img.convert('RGB') + else: + # When the mode is 'LA', the default conversion will fill in + # the canvas with black, which sometimes shadows black objects + # in the foreground. + # + # Therefore, a random color (124, 117, 104) is used for canvas + img_rgba = img.convert('RGBA') + img = Image.new('RGB', img_rgba.size, (124, 117, 104)) + img.paste(img_rgba, mask=img_rgba.split()[3]) # 3 is alpha + if flag == 'color': + array = np.array(img) + if channel_order != 'rgb': + array = array[:, :, ::-1] # RGB to BGR + elif flag == 'grayscale': + img = img.convert('L') + array = np.array(img) + else: + raise ValueError( + 'flag must be "color", "grayscale" or "unchanged", ' + f'but got {flag}') + return array + + def _imfrombytes(self,content, flag='color', channel_order='bgr'):#, backend=None): + """Read an image from bytes. """ + + img_np = np.frombuffer(content, np.uint8) + flag = imread_flags[flag] if isinstance(flag, str) else flag + img = cv2.imdecode(img_np, flag) + if flag == IMREAD_COLOR and channel_order == 'rgb': + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) + return img + + def __call__(self, results): + """Perform the ``RawFrameDecode`` to pick frames given indices. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + # mmcv.use_backend(self.decoding_backend) + + directory = results['frame_dir'] + suffix = results['suffix'] + #modality = results['modality'] + + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + + imgs = list() + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + offset = results.get('offset', 0) + + for frame_idx in results['frame_inds']: + frame_idx += offset + filepath = osp.join(directory, suffix.format(frame_idx)) + img_bytes = self.file_client.get(filepath) #以二进制方式读取图片 + # Get frame with channel order RGB directly. 
+ + cur_frame = self._imfrombytes(img_bytes, channel_order='rgb') + imgs.append(cur_frame) + + results['imgs'] = imgs + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + + # we resize the gt_bboxes and proposals to their real scale + h, w = results['img_shape'] + scale_factor = np.array([w, h, w, h]) + if 'gt_bboxes' in results: + gt_bboxes = results['gt_bboxes'] + gt_bboxes_new = (gt_bboxes * scale_factor).astype(np.float32) + results['gt_bboxes'] = gt_bboxes_new + if 'proposals' in results and results['proposals'] is not None: + proposals = results['proposals'] + proposals = (proposals * scale_factor).astype(np.float32) + results['proposals'] = proposals + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'io_backend={self.io_backend}, ' + f'decoding_backend={self.decoding_backend})') + return repr_str + +@PIPELINES.register() +class SampleAVAFrames(SampleFrames): + + def __init__(self, clip_len, frame_interval=2, test_mode=False): + + super().__init__(clip_len, frame_interval, test_mode=test_mode) + + def _get_clips(self, center_index, skip_offsets, shot_info): + start = center_index - (self.clip_len // 2) * self.frame_interval + end = center_index + ((self.clip_len + 1) // 2) * self.frame_interval + frame_inds = list(range(start, end, self.frame_interval)) + frame_inds = frame_inds + skip_offsets + frame_inds = np.clip(frame_inds, shot_info[0], shot_info[1] - 1) + + return frame_inds + + def __call__(self, results): + fps = results['fps'] + timestamp = results['timestamp'] + timestamp_start = results['timestamp_start'] + shot_info = results['shot_info'] + + #delta=(timestamp - timestamp_start) 为该帧距离15min视频开头有几秒 + #center_index=fps*delta为该帧距离15min视频开头有几帧 + #center_index+1是为了避免后续采样时出现负数? + #后续需要以center_index为中心前后采样视频帧片段 + center_index = fps * (timestamp - timestamp_start) + 1 + + skip_offsets = np.random.randint( + -self.frame_interval // 2, (self.frame_interval + 1) // 2, + size=self.clip_len) + frame_inds = self._get_clips(center_index, skip_offsets, shot_info) + + results['frame_inds'] = np.array(frame_inds, dtype=np.int) + results['clip_len'] = self.clip_len + results['frame_interval'] = self.frame_interval + results['num_clips'] = 1 + results['crop_quadruple'] = np.array([0, 0, 1, 1], dtype=np.float32) + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'frame_interval={self.frame_interval}, ' + f'test_mode={self.test_mode})') + return repr_str + diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/sample_ucf24.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/sample_ucf24.py new file mode 100644 index 0000000..7d9e904 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/sample_ucf24.py @@ -0,0 +1,69 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
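+
+# Added illustrative note: SamplerUCF24 below builds a clip that ends at the
+# annotated key frame. With assumed values num_frames=16, frame_interval=1,
+# valid_mode=True and key frame '00009.jpg' in a folder of 40 frames,
+# _make_clip(9, 40) returns [1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+# because indices below 1 are clamped to the first frame.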
+ +import os +import random + +from PIL import Image + +from ..registry import PIPELINES + + +@PIPELINES.register() +class SamplerUCF24(object): + """ + Sample frames id. + NOTE: Use PIL to read image here, has diff with CV2 + Args: + num_frames(int): The amount of frames used in a video + frame_interval(int): Sampling rate + valid_mode(bool): True or False. + Returns: + frames_idx: the index of sampled #frames. + """ + def __init__(self, + num_frames=16, + frame_interval=1, + valid_mode=False): + self.num_frames = num_frames + self.frame_interval = frame_interval if valid_mode else random.randint(1, 2) + self.valid_mode = valid_mode + + def _get(self, frames_idxs, img_folder, results): + imgs = [] + for idx in frames_idxs: + img = Image.open( + os.path.join(img_folder, '{:05d}.jpg'.format(idx))).convert('RGB') + imgs.append(img) + results['imgs'] = imgs + return results + + def _make_clip(self, im_ind, max_num): + frame_idxs = [] + for i in reversed(range(self.num_frames)): + # make it as a loop + i_temp = im_ind - i * self.frame_interval + if i_temp < 1: + i_temp = 1 + elif i_temp > max_num: + i_temp = max_num + frame_idxs.append(i_temp) + return frame_idxs + + def __call__(self, results): + img_folder, key_frame = os.path.split(results['filename']) + frame_len = len(os.listdir(img_folder)) + key_idx = int(key_frame[0:5]) + frame_idxs = self._make_clip(key_idx, frame_len) + return self._get(frame_idxs, img_folder, results) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/segmentation.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/segmentation.py new file mode 100644 index 0000000..2471442 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/segmentation.py @@ -0,0 +1,130 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from PIL import Image +import copy +import cv2 +from ..registry import PIPELINES + + +@PIPELINES.register() +class MultiRestrictSize(object): + def __init__(self, + min_size=None, + max_size=800, + flip=False, + multi_scale=[1.3]): + self.min_size = min_size + self.max_size = max_size + self.multi_scale = multi_scale + self.flip = flip + assert ((min_size is None)) or ((max_size is None)) + + def __call__(self, sample): + samples = [] + image = sample['current_img'] + h, w = image.shape[:2] + for scale in self.multi_scale: + # Fixed range of scales + sc = None + # Align short edge + if not (self.min_size is None): + if h > w: + short_edge = w + else: + short_edge = h + if short_edge > self.min_size: + sc = float(self.min_size) / short_edge + else: + if h > w: + long_edge = h + else: + long_edge = w + if long_edge > self.max_size: + sc = float(self.max_size) / long_edge + + if sc is None: + new_h = h + new_w = w + else: + new_h = sc * h + new_w = sc * w + new_h = int(new_h * scale) + new_w = int(new_w * scale) + + if (new_h - 1) % 16 != 0: + new_h = int(np.around((new_h - 1) / 16.) 
* 16 + 1) + if (new_w - 1) % 16 != 0: + new_w = int(np.around((new_w - 1) / 16.) * 16 + 1) + + if new_h == h and new_w == w: + samples.append(sample) + else: + new_sample = {} + for elem in sample.keys(): + if 'meta' in elem: + new_sample[elem] = sample[elem] + continue + tmp = sample[elem] + if 'label' in elem: + new_sample[elem] = sample[elem] + continue + else: + flagval = cv2.INTER_CUBIC + tmp = cv2.resize(tmp, + dsize=(new_w, new_h), + interpolation=flagval) + new_sample[elem] = tmp + samples.append(new_sample) + + if self.flip: + now_sample = samples[-1] + new_sample = {} + for elem in now_sample.keys(): + if 'meta' in elem: + new_sample[elem] = now_sample[elem].copy() + new_sample[elem]['flip'] = True + continue + tmp = now_sample[elem] + tmp = tmp[:, ::-1].copy() + new_sample[elem] = tmp + samples.append(new_sample) + + return samples + + +@PIPELINES.register() +class MultiNorm(object): + def __call__(self, samples): + for idx in range(len(samples)): + sample = samples[idx] + for elem in sample.keys(): + if 'meta' in elem: + continue + tmp = sample[elem] + if tmp is None: + continue + + if tmp.ndim == 2: + tmp = tmp[:, :, np.newaxis] + else: + tmp = tmp / 255. + tmp -= (0.485, 0.456, 0.406) + tmp /= (0.229, 0.224, 0.225) + + tmp = tmp.transpose((2, 0, 1)) + samples[idx][elem] = tmp + + return samples diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/segmentation_pipline.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/segmentation_pipline.py new file mode 100644 index 0000000..dda6dee --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/segmentation_pipline.py @@ -0,0 +1,40 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy + +import os +import numpy as np +import random +import paddle +from ..registry import PIPELINES +""" +pipeline ops for Action Segmentation Dataset. +""" + + +@PIPELINES.register() +class SegmentationSampler(object): + + def __init__(self, sample_rate): + self.sample_rate = sample_rate + + def __call__(self, results): + for key, data in results.items(): + if len(data.shape) == 1: + data = data[::self.sample_rate] + results[key] = copy.deepcopy(data) + else: + data = data[:, ::self.sample_rate] + results[key] = copy.deepcopy(data) + return results diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/skeleton_pipeline.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/skeleton_pipeline.py new file mode 100644 index 0000000..d31c816 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/skeleton_pipeline.py @@ -0,0 +1,1554 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import collections +from itertools import repeat +import copy as cp +from collections import abc +import numpy as np +import paddle.nn.functional as F +import random +import paddle +from ..registry import PIPELINES +from .augmentations_ava import iminvert, imflip_ +"""pipeline ops for Activity Net. +""" + + +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable): + return tuple(x) + return tuple(repeat(x, n)) + + return parse + + +_single = _ntuple(1) +_pair = _ntuple(2) +_triple = _ntuple(3) +_quadruple = _ntuple(4) + + +def _init_lazy_if_proper(results, lazy): + """Initialize lazy operation properly. + + Make sure that a lazy operation is properly initialized, + and avoid a non-lazy operation accidentally getting mixed in. + + Required keys in results are "imgs" if "img_shape" not in results, + otherwise, Required keys in results are "img_shape", add or modified keys + are "img_shape", "lazy". + Add or modified keys in "lazy" are "original_shape", "crop_bbox", "flip", + "flip_direction", "interpolation". + + Args: + results (dict): A dict stores data pipeline result. + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + + if 'img_shape' not in results: + results['img_shape'] = results['imgs'][0].shape[:2] + if lazy: + if 'lazy' not in results: + img_h, img_w = results['img_shape'] + lazyop = dict() + lazyop['original_shape'] = results['img_shape'] + lazyop['crop_bbox'] = np.array([0, 0, img_w, img_h], + dtype=np.float32) + lazyop['flip'] = False + lazyop['flip_direction'] = None + lazyop['interpolation'] = None + results['lazy'] = lazyop + else: + assert 'lazy' not in results, 'Use Fuse after lazy operations' + + +@PIPELINES.register() +class AutoPadding(object): + """ + Sample or Padding frame skeleton feature. + Args: + window_size: int, temporal size of skeleton feature. + random_pad: bool, whether do random padding when frame length < window size. Default: False. + """ + + def __init__(self, window_size, random_pad=False): + self.window_size = window_size + self.random_pad = random_pad + + def get_frame_num(self, data): + C, T, V, M = data.shape + for i in range(T - 1, -1, -1): + tmp = np.sum(data[:, i, :, :]) + if tmp > 0: + T = i + 1 + break + return T + + def __call__(self, results): + data = results['data'] + + C, T, V, M = data.shape + T = self.get_frame_num(data) + if T == self.window_size: + data_pad = data[:, :self.window_size, :, :] + elif T < self.window_size: + begin = random.randint( + 0, self.window_size - T) if self.random_pad else 0 + data_pad = np.zeros((C, self.window_size, V, M)) + data_pad[:, begin:begin + T, :, :] = data[:, :T, :, :] + else: + if self.random_pad: + index = np.random.choice( + T, self.window_size, replace=False).astype('int64') + else: + index = np.linspace(0, T, self.window_size).astype("int64") + data_pad = data[:, index, :, :] + + results['data'] = data_pad + return results + + +@PIPELINES.register() +class SkeletonNorm(object): + """ + Normalize skeleton feature. + Args: + aixs: dimensions of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default: 2. 
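+        squeeze: bool, whether to reshape the data to (C, T, V), assuming a single person (M=1). Default: False.
+
+    Note (added for clarity): coordinates are centralised by subtracting the
+    coordinates of joint index 8 (used as the reference joint) and only the
+    first `axis` coordinate channels are kept, e.g. (x, y) out of (x, y, acc).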
+ """ + + def __init__(self, axis=2, squeeze=False): + self.axis = axis + self.squeeze = squeeze + + def __call__(self, results): + data = results['data'] + + # Centralization + data = data - data[:, :, 8:9, :] + data = data[:self.axis, :, :, :] # get (x,y) from (x,y, acc) + C, T, V, M = data.shape + if self.squeeze: + data = data.reshape((C, T, V)) # M = 1 + + results['data'] = data.astype('float32') + if 'label' in results: + label = results['label'] + results['label'] = np.expand_dims(label, 0).astype('int64') + return results + + +@PIPELINES.register() +class Iden(object): + """ + Wrapper Pipeline + """ + + def __init__(self, label_expand=True): + self.label_expand = label_expand + + def __call__(self, results): + data = results['data'] + results['data'] = data.astype('float32') + + if 'label' in results and self.label_expand: + label = results['label'] + results['label'] = np.expand_dims(label, 0).astype('int64') + return results + + +@PIPELINES.register() +class RandomRotation(object): + """ + Random rotation sketeton. + Args: + argument: bool, if rotation. + theta: float, rotation rate. + """ + + def __init__(self, argument, theta=0.3): + self.theta = theta + self.argument = argument + + def _rot(self, rot): + """ + rot: T,3 + """ + cos_r, sin_r = np.cos(rot), np.sin(rot) # T,3 + zeros = np.zeros((rot.shape[0], 1)) # T,1 + ones = np.ones((rot.shape[0], 1)) # T,1 + + r1 = np.stack((ones, zeros, zeros), axis=-1) # T,1,3 + rx2 = np.stack((zeros, cos_r[:, 0:1], sin_r[:, 0:1]), axis=-1) # T,1,3 + rx3 = np.stack((zeros, -sin_r[:, 0:1], cos_r[:, 0:1]), axis=-1) # T,1,3 + rx = np.concatenate((r1, rx2, rx3), axis=1) # T,3,3 + + ry1 = np.stack((cos_r[:, 1:2], zeros, -sin_r[:, 1:2]), axis=-1) + r2 = np.stack((zeros, ones, zeros), axis=-1) + ry3 = np.stack((sin_r[:, 1:2], zeros, cos_r[:, 1:2]), axis=-1) + ry = np.concatenate((ry1, r2, ry3), axis=1) + + rz1 = np.stack((cos_r[:, 2:3], sin_r[:, 2:3], zeros), axis=-1) + r3 = np.stack((zeros, zeros, ones), axis=-1) + rz2 = np.stack((-sin_r[:, 2:3], cos_r[:, 2:3], zeros), axis=-1) + rz = np.concatenate((rz1, rz2, r3), axis=1) + + rot = np.matmul(np.matmul(rz, ry), rx) + return rot + + def __call__(self, results): + # C,T,V,M + data = results['data'] + if self.argument: + C, T, V, M = data.shape + data_numpy = np.transpose(data, (1, 0, 2, 3)).conjugate().reshape( + T, C, V * M) # T,3,V*M + rot = np.random.uniform(-self.theta, self.theta, 3) + rot = np.stack( + [ + rot, + ] * T, axis=0) + rot = self._rot(rot) # T,3,3 + data_numpy = np.matmul(rot, data_numpy) + data_numpy = data_numpy.reshape(T, C, V, M) + data_numpy = np.transpose(data_numpy, (1, 0, 2, 3)) + data = data_numpy + results['data'] = data.astype(np.float32) + return results + + +@PIPELINES.register() +class SketeonCropSample(object): + """ + Sketeon Crop Sampler. + Args: + crop_model: str, crop model, support: ['center']. + p_interval: list, crop len + window_size: int, sample windows size. 
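+
+    Note (added): trailing frames whose coordinates are all zero are treated
+    as padding; the valid span is cropped according to p_interval (a
+    symmetric centre crop for a single ratio, a random crop for a range) and
+    then resampled to `window_size` frames with bilinear interpolation.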
+ """ + + def __init__(self, window_size, crop_model='center', p_interval=1): + assert crop_model in ['center'], "Don't support :" + crop_model + + self.crop_model = crop_model + self.window_size = window_size + self.p_interval = p_interval + + def __call__(self, results): + if self.crop_model == 'center': + # input: C,T,V,M + data = results['data'] + valid_frame_num = np.sum(data.sum(0).sum(-1).sum(-1) != 0) + + C, T, V, M = data.shape + begin = 0 + end = valid_frame_num + valid_size = end - begin + + #crop + if len(self.p_interval) == 1: + p = self.p_interval[0] + bias = int((1 - p) * valid_size / 2) + data = data[:, begin + bias:end - bias, :, :] # center_crop + cropped_length = data.shape[1] + else: + p = np.random.rand(1) * (self.p_interval[1] - self.p_interval[0] + ) + self.p_interval[0] + # constraint cropped_length lower bound as 64 + cropped_length = np.minimum( + np.maximum(int(np.floor(valid_size * p)), 64), valid_size) + bias = np.random.randint(0, valid_size - cropped_length + 1) + data = data[:, begin + bias:begin + bias + cropped_length, :, :] + + # resize + data = np.transpose(data, (0, 2, 3, 1)).conjugate().reshape( + C * V * M, cropped_length) + data = data[None, None, :, :] + # could perform both up sample and down sample + data_tensor = paddle.to_tensor(data) + data_tensor = F.interpolate( + data_tensor, + size=(C * V * M, self.window_size), + mode='bilinear', + align_corners=False).squeeze() + data = paddle.transpose( + paddle.reshape(data_tensor, (C, V, M, self.window_size)), + (0, 3, 1, 2)).numpy() + else: + raise NotImplementedError + results['data'] = data + return results + + +@PIPELINES.register() +class SketeonModalityTransform(object): + """ + Sketeon Crop Sampler. + Args: + crop_model: str, crop model, support: ['center']. + p_interval: list, crop len + window_size: int, sample windows size. + """ + + def __init__(self, bone, motion, joint=True, graph='ntu_rgb_d'): + + self.joint = joint + self.bone = bone + self.motion = motion + self.graph = graph + if self.graph == "ntu_rgb_d": + self.bone_pairs = ((1, 2), (2, 21), (3, 21), (4, 3), (5, 21), + (6, 5), (7, 6), (8, 7), (9, 21), (10, 9), + (11, 10), (12, 11), (13, 1), (14, 13), (15, 14), + (16, 15), (17, 1), (18, 17), (19, 18), (20, 19), + (22, 23), (21, 21), (23, 8), (24, 25), (25, 12)) + else: + raise NotImplementedError + + def __call__(self, results): + if self.joint: + return results + data_numpy = results['data'] + if self.bone: + bone_data_numpy = np.zeros_like(data_numpy) + for v1, v2 in self.bone_pairs: + bone_data_numpy[:, :, v1 - + 1] = data_numpy[:, :, v1 - + 1] - data_numpy[:, :, v2 - 1] + data_numpy = bone_data_numpy + if self.motion: + data_numpy[:, :-1] = data_numpy[:, 1:] - data_numpy[:, :-1] + data_numpy[:, -1] = 0 + results['data'] = data_numpy + return results + + +@PIPELINES.register() +class UniformSampleFrames: + """Uniformly sample frames from the video. + + To sample an n-frame clip from the video. UniformSampleFrames basically + divide the video into n segments of equal length and randomly sample one + frame from each segment. To make the testing results reproducible, a + random seed is set during testing, to make the sampling results + deterministic. + + Required keys are "total_frames", "start_index" , added or modified keys + are "frame_inds", "clip_len", "frame_interval" and "num_clips". + + Args: + clip_len (int): Frames of each sampled output clip. + num_clips (int): Number of clips to be sampled. Default: 1. 
+ test_mode (bool): Store True when building test or validation dataset. + Default: False. + seed (int): The random seed used during test time. Default: 255. + """ + + def __init__(self, clip_len, num_clips=1, test_mode=False, seed=255): + + self.clip_len = clip_len + self.num_clips = num_clips + self.test_mode = test_mode + self.seed = seed + + def _get_train_clips(self, num_frames, clip_len): + """Uniformly sample indices for training clips. + + Args: + num_frames (int): The number of frames. + clip_len (int): The length of the clip. + """ + + assert self.num_clips == 1 + if num_frames < clip_len: + start = np.random.randint(0, num_frames) + inds = np.arange(start, start + clip_len) + elif clip_len <= num_frames < 2 * clip_len: + basic = np.arange(clip_len) + inds = np.random.choice( + clip_len + 1, num_frames - clip_len, replace=False) + offset = np.zeros(clip_len + 1, dtype=np.int64) + offset[inds] = 1 + offset = np.cumsum(offset) + inds = basic + offset[:-1] + else: + bids = np.array( + [i * num_frames // clip_len for i in range(clip_len + 1)]) + bsize = np.diff(bids) + bst = bids[:clip_len] + offset = np.random.randint(bsize) + inds = bst + offset + return inds + + def _get_test_clips(self, num_frames, clip_len): + """Uniformly sample indices for testing clips. + + Args: + num_frames (int): The number of frames. + clip_len (int): The length of the clip. + """ + + np.random.seed(self.seed) + if num_frames < clip_len: + # Then we use a simple strategy + if num_frames < self.num_clips: + start_inds = list(range(self.num_clips)) + else: + start_inds = [ + i * num_frames // self.num_clips + for i in range(self.num_clips) + ] + inds = np.concatenate( + [np.arange(i, i + clip_len) for i in start_inds]) + elif clip_len <= num_frames < clip_len * 2: + all_inds = [] + for i in range(self.num_clips): + basic = np.arange(clip_len) + inds = np.random.choice( + clip_len + 1, num_frames - clip_len, replace=False) + offset = np.zeros(clip_len + 1, dtype=np.int64) + offset[inds] = 1 + offset = np.cumsum(offset) + inds = basic + offset[:-1] + all_inds.append(inds) + inds = np.concatenate(all_inds) + else: + bids = np.array( + [i * num_frames // clip_len for i in range(clip_len + 1)]) + bsize = np.diff(bids) + bst = bids[:clip_len] + all_inds = [] + for i in range(self.num_clips): + offset = np.random.randint(bsize) + all_inds.append(bst + offset) + inds = np.concatenate(all_inds) + return inds + + def __call__(self, results): + num_frames = results['total_frames'] + + if self.test_mode: + inds = self._get_test_clips(num_frames, self.clip_len) + else: + inds = self._get_train_clips(num_frames, self.clip_len) + + inds = np.mod(inds, num_frames) + start_index = results['start_index'] + inds = inds + start_index + + results['frame_inds'] = inds.astype(np.int) + results['clip_len'] = self.clip_len + results['frame_interval'] = None + results['num_clips'] = self.num_clips + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'num_clips={self.num_clips}, ' + f'test_mode={self.test_mode}, ' + f'seed={self.seed})') + return repr_str + + +@PIPELINES.register() +class PoseDecode: + """Load and decode pose with given indices. + + Required keys are "keypoint", "frame_inds" (optional), "keypoint_score" + (optional), added or modified keys are "keypoint", "keypoint_score" (if + applicable). + """ + + @staticmethod + def _load_kp(kp, frame_inds): + """Load keypoints given frame indices. + + Args: + kp (np.ndarray): The keypoint coordinates. 
+ frame_inds (np.ndarray): The frame indices. + """ + + return [x[frame_inds].astype(np.float32) for x in kp] + + @staticmethod + def _load_kpscore(kpscore, frame_inds): + """Load keypoint scores given frame indices. + + Args: + kpscore (np.ndarray): The confidence scores of keypoints. + frame_inds (np.ndarray): The frame indices. + """ + + return [x[frame_inds].astype(np.float32) for x in kpscore] + + def __call__(self, results): + + if 'frame_inds' not in results: + results['frame_inds'] = np.arange(results['total_frames']) + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + offset = results.get('offset', 0) + frame_inds = results['frame_inds'] + offset + + if 'keypoint_score' in results: + kpscore = results['keypoint_score'] + results['keypoint_score'] = kpscore[:, frame_inds].astype( + np.float32) + + if 'keypoint' in results: + results['keypoint'] = results['keypoint'][:, frame_inds].astype( + np.float32) + + return results + + def __repr__(self): + repr_str = f'{self.__class__.__name__}()' + return repr_str + + +@PIPELINES.register() +class PoseCompact: + """Convert the coordinates of keypoints to make it more compact. + Specifically, it first find a tight bounding box that surrounds all joints + in each frame, then we expand the tight box by a given padding ratio. For + example, if 'padding == 0.25', then the expanded box has unchanged center, + and 1.25x width and height. + + Required keys in results are "img_shape", "keypoint", add or modified keys + are "img_shape", "keypoint", "crop_quadruple". + + Args: + padding (float): The padding size. Default: 0.25. + threshold (int): The threshold for the tight bounding box. If the width + or height of the tight bounding box is smaller than the threshold, + we do not perform the compact operation. Default: 10. + hw_ratio (float | tuple[float] | None): The hw_ratio of the expanded + box. Float indicates the specific ratio and tuple indicates a + ratio range. If set as None, it means there is no requirement on + hw_ratio. Default: None. + allow_imgpad (bool): Whether to allow expanding the box outside the + image to meet the hw_ratio requirement. Default: True. + + Returns: + type: Description of returned object. + """ + + def __init__(self, + padding=0.25, + threshold=10, + hw_ratio=None, + allow_imgpad=True): + + self.padding = padding + self.threshold = threshold + if hw_ratio is not None: + hw_ratio = _pair(hw_ratio) + + self.hw_ratio = hw_ratio + + self.allow_imgpad = allow_imgpad + assert self.padding >= 0 + + def _combine_quadruple(self, a, b): + return (a[0] + a[2] * b[0], a[1] + a[3] * b[1], a[2] * b[2], + a[3] * b[3]) + + def __call__(self, results): + img_shape = results['img_shape'] + h, w = img_shape + kp = results['keypoint'] + + # Make NaN zero + kp[np.isnan(kp)] = 0. 
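+        # Added note: the block below finds the tight bounding box of all
+        # non-zero keypoints, enlarges it by `padding` (and enforces
+        # `hw_ratio` if set), shifts the keypoints into the new box and
+        # records the change in results['img_shape'] and
+        # results['crop_quadruple'].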
+ kp_x = kp[..., 0] + kp_y = kp[..., 1] + + min_x = np.min(kp_x[kp_x != 0], initial=np.Inf) + min_y = np.min(kp_y[kp_y != 0], initial=np.Inf) + max_x = np.max(kp_x[kp_x != 0], initial=-np.Inf) + max_y = np.max(kp_y[kp_y != 0], initial=-np.Inf) + + # The compact area is too small + if max_x - min_x < self.threshold or max_y - min_y < self.threshold: + return results + + center = ((max_x + min_x) / 2, (max_y + min_y) / 2) + half_width = (max_x - min_x) / 2 * (1 + self.padding) + half_height = (max_y - min_y) / 2 * (1 + self.padding) + + if self.hw_ratio is not None: + half_height = max(self.hw_ratio[0] * half_width, half_height) + half_width = max(1 / self.hw_ratio[1] * half_height, half_width) + + min_x, max_x = center[0] - half_width, center[0] + half_width + min_y, max_y = center[1] - half_height, center[1] + half_height + + # hot update + if not self.allow_imgpad: + min_x, min_y = int(max(0, min_x)), int(max(0, min_y)) + max_x, max_y = int(min(w, max_x)), int(min(h, max_y)) + else: + min_x, min_y = int(min_x), int(min_y) + max_x, max_y = int(max_x), int(max_y) + + kp_x[kp_x != 0] -= min_x + kp_y[kp_y != 0] -= min_y + + new_shape = (max_y - min_y, max_x - min_x) + results['img_shape'] = new_shape + + # the order is x, y, w, h (in [0, 1]), a tuple + crop_quadruple = results.get('crop_quadruple', (0., 0., 1., 1.)) + new_crop_quadruple = (min_x / w, min_y / h, (max_x - min_x) / w, + (max_y - min_y) / h) + crop_quadruple = self._combine_quadruple(crop_quadruple, + new_crop_quadruple) + results['crop_quadruple'] = crop_quadruple + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(padding={self.padding}, ' + f'threshold={self.threshold}, ' + f'hw_ratio={self.hw_ratio}, ' + f'allow_imgpad={self.allow_imgpad})') + return repr_str + + +class CropBase: + @staticmethod + def _crop_kps(kps, crop_bbox): + return kps - crop_bbox[:2] + + @staticmethod + def _crop_imgs(imgs, crop_bbox): + x1, y1, x2, y2 = crop_bbox + return [img[y1:y2, x1:x2] for img in imgs] + + @staticmethod + def _box_crop(box, crop_bbox): + """Crop the bounding boxes according to the crop_bbox. + + Args: + box (np.ndarray): The bounding boxes. + crop_bbox(np.ndarray): The bbox used to crop the original image. + """ + + x1, y1, x2, y2 = crop_bbox + img_w, img_h = x2 - x1, y2 - y1 + + box_ = box.copy() + box_[..., 0::2] = np.clip(box[..., 0::2] - x1, 0, img_w - 1) + box_[..., 1::2] = np.clip(box[..., 1::2] - y1, 0, img_h - 1) + return box_ + + def _all_box_crop(self, results, crop_bbox): + """Crop the gt_bboxes and proposals in results according to crop_bbox. + + Args: + results (dict): All information about the sample, which contain + 'gt_bboxes' and 'proposals' (optional). + crop_bbox(np.ndarray): The bbox used to crop the original image. + """ + results['gt_bboxes'] = self._box_crop(results['gt_bboxes'], crop_bbox) + if 'proposals' in results and results['proposals'] is not None: + assert results['proposals'].shape[1] == 4 + results['proposals'] = self._box_crop(results['proposals'], + crop_bbox) + return results + + def __call__(self, results): + raise NotImplementedError + + +@PIPELINES.register() +class RandomResizedCrop_V2(CropBase): + """Random crop that specifics the area and height-weight ratio range. + + Required keys in results are "img_shape", "crop_bbox", "imgs" (optional), + "keypoint" (optional), added or modified keys are "imgs", "keypoint", + "crop_bbox" and "lazy"; Required keys in "lazy" are "flip", "crop_bbox", + added or modified key is "crop_bbox". 
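+
+    Note (added): in this implementation `area_range` is parsed with eval(),
+    so configs are expected to pass it as a string such as "(0.08, 1.0)";
+    `aspect_ratio_range` is used as given.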
+ + Args: + area_range (Tuple[float]): The candidate area scales range of + output cropped images. Default: (0.08, 1.0). + aspect_ratio_range (Tuple[float]): The candidate aspect ratio range of + output cropped images. Default: (3 / 4, 4 / 3). + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + + def __init__(self, + area_range=(0.08, 1.0), + aspect_ratio_range=(3 / 4, 4 / 3), + lazy=False): + self.area_range = eval(area_range) + self.aspect_ratio_range = aspect_ratio_range + self.lazy = lazy + if not is_tuple_of(self.area_range, float): + raise TypeError(f'Area_range must be a tuple of float, ' + f'but got {type(area_range)}') + if not is_tuple_of(self.aspect_ratio_range, float): + raise TypeError(f'Aspect_ratio_range must be a tuple of float, ' + f'but got {type(aspect_ratio_range)}') + + @staticmethod + def get_crop_bbox(img_shape, + area_range, + aspect_ratio_range, + max_attempts=10): + """Get a crop bbox given the area range and aspect ratio range. + + Args: + img_shape (Tuple[int]): Image shape + area_range (Tuple[float]): The candidate area scales range of + output cropped images. Default: (0.08, 1.0). + aspect_ratio_range (Tuple[float]): The candidate aspect + ratio range of output cropped images. Default: (3 / 4, 4 / 3). + max_attempts (int): The maximum of attempts. Default: 10. + max_attempts (int): Max attempts times to generate random candidate + bounding box. If it doesn't qualified one, the center bounding + box will be used. + Returns: + (list[int]) A random crop bbox within the area range and aspect + ratio range. + """ + assert 0 < area_range[0] <= area_range[1] <= 1 + assert 0 < aspect_ratio_range[0] <= aspect_ratio_range[1] + + img_h, img_w = img_shape + area = img_h * img_w + + min_ar, max_ar = aspect_ratio_range + aspect_ratios = np.exp( + np.random.uniform( + np.log(min_ar), np.log(max_ar), size=max_attempts)) + target_areas = np.random.uniform(*area_range, size=max_attempts) * area + candidate_crop_w = np.round(np.sqrt( + target_areas * aspect_ratios)).astype(np.int32) + candidate_crop_h = np.round(np.sqrt( + target_areas / aspect_ratios)).astype(np.int32) + + for i in range(max_attempts): + crop_w = candidate_crop_w[i] + crop_h = candidate_crop_h[i] + if crop_h <= img_h and crop_w <= img_w: + x_offset = random.randint(0, img_w - crop_w) + y_offset = random.randint(0, img_h - crop_h) + return x_offset, y_offset, x_offset + crop_w, y_offset + crop_h + + # Fallback + crop_size = min(img_h, img_w) + x_offset = (img_w - crop_size) // 2 + y_offset = (img_h - crop_size) // 2 + return x_offset, y_offset, x_offset + crop_size, y_offset + crop_size + + def __call__(self, results): + """Performs the RandomResizeCrop augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
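+
+        Note (added): a crop box is drawn with get_crop_bbox(), the running
+        results['crop_quadruple'] is composed with the new relative crop, and
+        the keypoints / imgs (or the lazy-op record) plus any gt_bboxes and
+        proposals are cropped accordingly.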
+ """ + _init_lazy_if_proper(results, self.lazy) + if 'keypoint' in results: + assert not self.lazy, ('Keypoint Augmentations are not compatible ' + 'with lazy == True') + + img_h, img_w = results['img_shape'] + + left, top, right, bottom = self.get_crop_bbox( + (img_h, img_w), self.area_range, self.aspect_ratio_range) + new_h, new_w = bottom - top, right - left + + if 'crop_quadruple' not in results: + results['crop_quadruple'] = np.array( + [0, 0, 1, 1], # x, y, w, h + dtype=np.float32) + + x_ratio, y_ratio = left / img_w, top / img_h + w_ratio, h_ratio = new_w / img_w, new_h / img_h + + old_crop_quadruple = results['crop_quadruple'] + old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1] + old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3] + new_crop_quadruple = [ + old_x_ratio + x_ratio * old_w_ratio, + old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio, + h_ratio * old_h_ratio + ] + results['crop_quadruple'] = np.array( + new_crop_quadruple, dtype=np.float32) + + crop_bbox = np.array([left, top, right, bottom]) + results['crop_bbox'] = crop_bbox + results['img_shape'] = (new_h, new_w) + + if not self.lazy: + if 'keypoint' in results: + results['keypoint'] = self._crop_kps(results['keypoint'], + crop_bbox) + if 'imgs' in results: + results['imgs'] = self._crop_imgs(results['imgs'], crop_bbox) + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Put Flip at last for now') + + # record crop_bbox in lazyop dict to ensure only crop once in Fuse + lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox'] + left = left * (lazy_right - lazy_left) / img_w + right = right * (lazy_right - lazy_left) / img_w + top = top * (lazy_bottom - lazy_top) / img_h + bottom = bottom * (lazy_bottom - lazy_top) / img_h + lazyop['crop_bbox'] = np.array( + [(lazy_left + left), (lazy_top + top), (lazy_left + right), + (lazy_top + bottom)], + dtype=np.float32) + + if 'gt_bboxes' in results: + assert not self.lazy + results = self._all_box_crop(results, results['crop_bbox']) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'area_range={self.area_range}, ' + f'aspect_ratio_range={self.aspect_ratio_range}, ' + f'lazy={self.lazy})') + return repr_str + + +def is_seq_of(seq, expected_type, seq_type=None): + """Check whether it is a sequence of some type. + + Args: + seq (Sequence): The sequence to be checked. + expected_type (type): Expected type of sequence items. + seq_type (type, optional): Expected sequence type. + + Returns: + bool: Whether the sequence is valid. + """ + if seq_type is None: + exp_seq_type = abc.Sequence + else: + assert isinstance(seq_type, type) + exp_seq_type = seq_type + if not isinstance(seq, exp_seq_type): + return False + for item in seq: + if not isinstance(item, expected_type): + return False + return True + + +def is_tuple_of(seq, expected_type): + """Check whether it is a tuple of some type. + + A partial method of :func:`is_seq_of`. + """ + return is_seq_of(seq, expected_type, seq_type=tuple) + + +@PIPELINES.register() +class CenterCrop_V2(CropBase): + """Crop the center area from images. + + Required keys are "img_shape", "imgs" (optional), "keypoint" (optional), + added or modified keys are "imgs", "keypoint", "crop_bbox", "lazy" and + "img_shape". Required keys in "lazy" is "crop_bbox", added or modified key + is "crop_bbox". + + Args: + crop_size (int | tuple[int]): (w, h) of crop size. + lazy (bool): Determine whether to apply lazy operation. 
Default: False. + """ + + def __init__(self, crop_size, lazy=False): + self.crop_size = _pair(crop_size) + self.lazy = lazy + if not is_tuple_of(self.crop_size, int): + raise TypeError(f'Crop_size must be int or tuple of int, ' + f'but got {type(crop_size)}') + + def __call__(self, results): + """Performs the CenterCrop augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + _init_lazy_if_proper(results, self.lazy) + if 'keypoint' in results: + assert not self.lazy, ('Keypoint Augmentations are not compatible ' + 'with lazy == True') + + img_h, img_w = results['img_shape'] + crop_w, crop_h = self.crop_size + + left = (img_w - crop_w) // 2 + top = (img_h - crop_h) // 2 + right = left + crop_w + bottom = top + crop_h + new_h, new_w = bottom - top, right - left + + crop_bbox = np.array([left, top, right, bottom]) + results['crop_bbox'] = crop_bbox + results['img_shape'] = (new_h, new_w) + + if 'crop_quadruple' not in results: + results['crop_quadruple'] = np.array( + [0, 0, 1, 1], # x, y, w, h + dtype=np.float32) + + x_ratio, y_ratio = left / img_w, top / img_h + w_ratio, h_ratio = new_w / img_w, new_h / img_h + + old_crop_quadruple = results['crop_quadruple'] + old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1] + old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3] + new_crop_quadruple = [ + old_x_ratio + x_ratio * old_w_ratio, + old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio, + h_ratio * old_h_ratio + ] + results['crop_quadruple'] = np.array( + new_crop_quadruple, dtype=np.float32) + + if not self.lazy: + if 'keypoint' in results: + results['keypoint'] = self._crop_kps(results['keypoint'], + crop_bbox) + if 'imgs' in results: + results['imgs'] = self._crop_imgs(results['imgs'], crop_bbox) + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Put Flip at last for now') + + # record crop_bbox in lazyop dict to ensure only crop once in Fuse + lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox'] + left = left * (lazy_right - lazy_left) / img_w + right = right * (lazy_right - lazy_left) / img_w + top = top * (lazy_bottom - lazy_top) / img_h + bottom = bottom * (lazy_bottom - lazy_top) / img_h + lazyop['crop_bbox'] = np.array( + [(lazy_left + left), (lazy_top + top), (lazy_left + right), + (lazy_top + bottom)], + dtype=np.float32) + + if 'gt_bboxes' in results: + assert not self.lazy + results = self._all_box_crop(results, results['crop_bbox']) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(crop_size={self.crop_size}, ' + f'lazy={self.lazy})') + return repr_str + + +@PIPELINES.register() +class Flip_V2: + """Flip the input images with a probability. + + Reverse the order of elements in the given imgs with a specific direction. + The shape of the imgs is preserved, but the elements are reordered. + + Required keys are "img_shape", "modality", "imgs" (optional), "keypoint" + (optional), added or modified keys are "imgs", "keypoint", "lazy" and + "flip_direction". Required keys in "lazy" is None, added or modified key + are "flip" and "flip_direction". The Flip augmentation should be placed + after any cropping / reshaping augmentations, to make sure crop_quadruple + is calculated properly. + + Args: + flip_ratio (float): Probability of implementing flip. Default: 0.5. + direction (str): Flip imgs horizontally or vertically. Options are + "horizontal" | "vertical". Default: "horizontal". 
+ flip_label_map (Dict[int, int] | None): Transform the label of the + flipped image with the specific label. Default: None. + left_kp (list[int]): Indexes of left keypoints, used to flip keypoints. + Default: None. + right_kp (list[ind]): Indexes of right keypoints, used to flip + keypoints. Default: None. + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + _directions = ['horizontal', 'vertical'] + + def __init__(self, + flip_ratio=0.5, + direction='horizontal', + flip_label_map=None, + left_kp=None, + right_kp=None, + lazy=False): + if direction not in self._directions: + raise ValueError(f'Direction {direction} is not supported. ' + f'Currently support ones are {self._directions}') + self.flip_ratio = flip_ratio + self.direction = direction + self.flip_label_map = flip_label_map + self.left_kp = left_kp + self.right_kp = right_kp + self.lazy = lazy + + def _flip_imgs(self, imgs, modality): + _ = [imflip_(img, self.direction) for img in imgs] + lt = len(imgs) + if modality == 'Flow': + # The 1st frame of each 2 frames is flow-x + for i in range(0, lt, 2): + imgs[i] = iminvert(imgs[i]) + return imgs + + def _flip_kps(self, kps, kpscores, img_width): + kp_x = kps[..., 0] + kp_x[kp_x != 0] = img_width - kp_x[kp_x != 0] + new_order = list(range(kps.shape[2])) + if self.left_kp is not None and self.right_kp is not None: + for left, right in zip(self.left_kp, self.right_kp): + new_order[left] = right + new_order[right] = left + kps = kps[:, :, new_order] + if kpscores is not None: + kpscores = kpscores[:, :, new_order] + return kps, kpscores + + @staticmethod + def _box_flip(box, img_width): + """Flip the bounding boxes given the width of the image. + + Args: + box (np.ndarray): The bounding boxes. + img_width (int): The img width. + """ + box_ = box.copy() + box_[..., 0::4] = img_width - box[..., 2::4] + box_[..., 2::4] = img_width - box[..., 0::4] + return box_ + + def __call__(self, results): + """Performs the Flip augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
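+
+        Example (an illustrative sketch with a minimal ``results`` dict;
+        the values are made up)::
+
+            flip = Flip_V2(flip_ratio=1., direction='horizontal')
+            results = dict(imgs=[np.zeros((8, 8, 3), dtype=np.uint8)],
+                           img_shape=(8, 8), modality='RGB')
+            results = flip(results)  # results['flip'] is True for ratio 1.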
+ """ + _init_lazy_if_proper(results, self.lazy) + if 'keypoint' in results: + assert not self.lazy, ('Keypoint Augmentations are not compatible ' + 'with lazy == True') + assert self.direction == 'horizontal', ( + 'Only horizontal flips are' + 'supported for human keypoints') + + modality = results['modality'] + if modality == 'Flow': + assert self.direction == 'horizontal' + + flip = np.random.rand() < self.flip_ratio + + results['flip'] = flip + results['flip_direction'] = self.direction + img_width = results['img_shape'][1] + + if self.flip_label_map is not None and flip: + results['label'] = self.flip_label_map.get(results['label'], + results['label']) + + if not self.lazy: + if flip: + if 'imgs' in results: + results['imgs'] = self._flip_imgs(results['imgs'], modality) + if 'keypoint' in results: + kp = results['keypoint'] + kpscore = results.get('keypoint_score', None) + kp, kpscore = self._flip_kps(kp, kpscore, img_width) + results['keypoint'] = kp + if 'keypoint_score' in results: + results['keypoint_score'] = kpscore + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Use one Flip please') + lazyop['flip'] = flip + lazyop['flip_direction'] = self.direction + + if 'gt_bboxes' in results and flip: + assert not self.lazy and self.direction == 'horizontal' + width = results['img_shape'][1] + results['gt_bboxes'] = self._box_flip(results['gt_bboxes'], width) + if 'proposals' in results and results['proposals'] is not None: + assert results['proposals'].shape[1] == 4 + results['proposals'] = self._box_flip(results['proposals'], + width) + + return results + + def __repr__(self): + repr_str = ( + f'{self.__class__.__name__}(' + f'flip_ratio={self.flip_ratio}, direction={self.direction}, ' + f'flip_label_map={self.flip_label_map}, lazy={self.lazy})') + return repr_str + + +@PIPELINES.register() +class FormatShape: + """Format final imgs shape to the given input_format. + + Required keys are "imgs", "num_clips" and "clip_len", added or modified + keys are "imgs" and "input_shape". + + Args: + input_format (str): Define the final imgs format. + collapse (bool): To collpase input_format N... to ... (NCTHW to CTHW, + etc.) if N is 1. Should be set as True when training and testing + detectors. Default: False. + """ + + def __init__(self, input_format, collapse=False): + self.input_format = input_format + self.collapse = collapse + if self.input_format not in ['NCTHW', 'NCHW', 'NCHW_Flow', 'NPTCHW']: + raise ValueError( + f'The input format {self.input_format} is invalid.') + + def __call__(self, results): + """Performs the FormatShape formating. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
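+
+        Example (illustrative shapes): with ``num_clips=2``, ``clip_len=4``
+        and 8 sampled frames of shape (224, 224, 3), ``input_format='NCTHW'``
+        reshapes imgs from (8, 224, 224, 3) to (2, 3, 4, 224, 224),
+        i.e. N_clips x C x T x H x W.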
+        """
+        if not isinstance(results['imgs'], np.ndarray):
+            results['imgs'] = np.array(results['imgs'])
+        imgs = results['imgs']
+        # [M x H x W x C]
+        # M = 1 * N_crops * N_clips * L
+        if self.collapse:
+            assert results['num_clips'] == 1
+
+        if self.input_format == 'NCTHW':
+            num_clips = results['num_clips']
+            clip_len = results['clip_len']
+
+            imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:])
+            # N_crops x N_clips x L x H x W x C
+            imgs = np.transpose(imgs, (0, 1, 5, 2, 3, 4))
+            # N_crops x N_clips x C x L x H x W
+            imgs = imgs.reshape((-1, ) + imgs.shape[2:])
+            # M' x C x L x H x W
+            # M' = N_crops x N_clips
+        elif self.input_format == 'NCHW':
+            imgs = np.transpose(imgs, (0, 3, 1, 2))
+            # M x C x H x W
+        elif self.input_format == 'NCHW_Flow':
+            num_clips = results['num_clips']
+            clip_len = results['clip_len']
+            imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:])
+            # N_crops x N_clips x L x H x W x C
+            imgs = np.transpose(imgs, (0, 1, 2, 5, 3, 4))
+            # N_crops x N_clips x L x C x H x W
+            imgs = imgs.reshape((-1, imgs.shape[2] * imgs.shape[3]) +
+                                imgs.shape[4:])
+            # M' x C' x H x W
+            # M' = N_crops x N_clips
+            # C' = L x C
+        elif self.input_format == 'NPTCHW':
+            num_proposals = results['num_proposals']
+            num_clips = results['num_clips']
+            clip_len = results['clip_len']
+            imgs = imgs.reshape((num_proposals, num_clips * clip_len) +
+                                imgs.shape[1:])
+            # P x M x H x W x C
+            # M = N_clips x L
+            imgs = np.transpose(imgs, (0, 1, 4, 2, 3))
+            # P x M x C x H x W
+
+        if self.collapse:
+            assert imgs.shape[0] == 1
+            imgs = imgs.squeeze(0)
+
+        results['imgs'] = imgs
+        results['input_shape'] = imgs.shape
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f"(input_format='{self.input_format}')"
+        return repr_str
+
+
+@PIPELINES.register()
+class Collect:
+    """Collect data from the loader relevant to the specific task.
+
+    This keeps the items in ``keys`` as they are, and collects items in
+    ``meta_keys`` into a meta item called ``meta_name``. This is usually
+    the last stage of the data loader pipeline.
+    For example, when keys='imgs', meta_keys=('filename', 'label',
+    'original_shape'), meta_name='img_metas', the results will be a dict with
+    keys 'imgs' and 'img_metas', where 'img_metas' is a DataContainer of
+    another dict with keys 'filename', 'label', 'original_shape'.
+
+    Args:
+        keys (Sequence[str]): Required keys to be collected.
+        meta_name (str): The name of the key that contains meta information.
+            This key is always populated. Default: "img_metas".
+        meta_keys (Sequence[str]): Keys that are collected under meta_name.
+            The contents of the ``meta_name`` dictionary depend on
+            ``meta_keys``.
+            By default this includes:
+
+            - "filename": path to the image file
+            - "label": label of the image file
+            - "original_shape": original shape of the image as a tuple
+                (h, w, c)
+            - "img_shape": shape of the image input to the network as a tuple
+                (h, w, c). Note that images may be zero padded on the
+                bottom/right, if the batch tensor is larger than this shape.
+            - "pad_shape": image shape after padding
+            - "flip_direction": a str in ("horizontal", "vertical") to
+                indicate if the image is flipped horizontally or vertically.
+            - "img_norm_cfg": a dict of normalization information:
+                - mean - per channel mean subtraction
+                - std - per channel std divisor
+                - to_rgb - bool indicating if bgr was converted to rgb
+        nested (bool): If set as True, will apply data[x] = [data[x]] to all
+            items in data. The arg is added for compatibility.
Default: False. + """ + + def __init__(self, + keys, + meta_keys=('filename', 'label', 'original_shape', 'img_shape', + 'pad_shape', 'flip_direction', 'img_norm_cfg'), + meta_name='img_metas'): + self.keys = keys + self.meta_keys = meta_keys + self.meta_name = meta_name + + def __call__(self, results): + """Performs the Collect formating. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + data = [] + for key in self.keys: + data.append(results[key]) + + if len(self.meta_keys) != 0: + meta = {} + for key in self.meta_keys: + meta[key] = results[key] + data.append(meta) + + return data + + def __repr__(self): + return (f'{self.__class__.__name__}(' + f'keys={self.keys}, meta_keys={self.meta_keys}, ' + f'nested={self.nested})') + + +@PIPELINES.register() +class GeneratePoseTarget: + """Generate pseudo heatmaps based on joint coordinates and confidence. + + Required keys are "keypoint", "img_shape", "keypoint_score" (optional), + added or modified keys are "imgs". + + Args: + sigma (float): The sigma of the generated gaussian map. Default: 0.6. + use_score (bool): Use the confidence score of keypoints as the maximum + of the gaussian maps. Default: True. + with_kp (bool): Generate pseudo heatmaps for keypoints. Default: True. + with_limb (bool): Generate pseudo heatmaps for limbs. At least one of + 'with_kp' and 'with_limb' should be True. Default: False. + skeletons (tuple[tuple]): The definition of human skeletons. + Default: ((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7), (7, 9), + (0, 6), (6, 8), (8, 10), (5, 11), (11, 13), (13, 15), + (6, 12), (12, 14), (14, 16), (11, 12)), + which is the definition of COCO-17p skeletons. + double (bool): Output both original heatmaps and flipped heatmaps. + Default: False. + left_kp (tuple[int]): Indexes of left keypoints, which is used when + flipping heatmaps. Default: (1, 3, 5, 7, 9, 11, 13, 15), + which is left keypoints in COCO-17p. + right_kp (tuple[int]): Indexes of right keypoints, which is used when + flipping heatmaps. Default: (2, 4, 6, 8, 10, 12, 14, 16), + which is right keypoints in COCO-17p. + """ + + def __init__(self, + sigma=0.6, + use_score=True, + with_kp=True, + with_limb=False, + skeletons=((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7), + (7, 9), (0, 6), (6, 8), (8, 10), (5, 11), (11, 13), + (13, 15), (6, 12), (12, 14), (14, 16), (11, 12)), + double=False, + left_kp=(1, 3, 5, 7, 9, 11, 13, 15), + right_kp=(2, 4, 6, 8, 10, 12, 14, 16)): + + self.sigma = sigma + self.use_score = use_score + self.with_kp = with_kp + self.with_limb = with_limb + self.double = double + + # an auxiliary const + self.eps = 1e-4 + + assert self.with_kp or self.with_limb, ( + 'At least one of "with_limb" ' + 'and "with_kp" should be set as True.') + self.left_kp = left_kp + self.right_kp = right_kp + self.skeletons = skeletons + + def generate_a_heatmap(self, img_h, img_w, centers, sigma, max_values): + """Generate pseudo heatmap for one keypoint in one frame. + + Args: + img_h (int): The height of the heatmap. + img_w (int): The width of the heatmap. + centers (np.ndarray): The coordinates of corresponding keypoints + (of multiple persons). + sigma (float): The sigma of generated gaussian. + max_values (np.ndarray): The max values of each keypoint. + + Returns: + np.ndarray: The generated pseudo heatmap. 
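+
+        Note: each center with a sufficiently large max value contributes a
+        Gaussian patch ``max_value * exp(-((x - mu_x)**2 + (y - mu_y)**2)
+        / (2 * sigma**2))`` inside a 3-sigma window, and patches from
+        different persons are merged with an element-wise maximum.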
+ """ + + heatmap = np.zeros([img_h, img_w], dtype=np.float32) + + for center, max_value in zip(centers, max_values): + mu_x, mu_y = center[0], center[1] + if max_value < self.eps: + continue + + st_x = max(int(mu_x - 3 * sigma), 0) + ed_x = min(int(mu_x + 3 * sigma) + 1, img_w) + st_y = max(int(mu_y - 3 * sigma), 0) + ed_y = min(int(mu_y + 3 * sigma) + 1, img_h) + x = np.arange(st_x, ed_x, 1, np.float32) + y = np.arange(st_y, ed_y, 1, np.float32) + + # if the keypoint not in the heatmap coordinate system + if not (len(x) and len(y)): + continue + y = y[:, None] + + patch = np.exp(-((x - mu_x)**2 + (y - mu_y)**2) / 2 / sigma**2) + patch = patch * max_value + heatmap[st_y:ed_y, st_x:ed_x] = np.maximum( + heatmap[st_y:ed_y, st_x:ed_x], patch) + + return heatmap + + def generate_a_limb_heatmap(self, img_h, img_w, starts, ends, sigma, + start_values, end_values): + """Generate pseudo heatmap for one limb in one frame. + + Args: + img_h (int): The height of the heatmap. + img_w (int): The width of the heatmap. + starts (np.ndarray): The coordinates of one keypoint in the + corresponding limbs (of multiple persons). + ends (np.ndarray): The coordinates of the other keypoint in the + corresponding limbs (of multiple persons). + sigma (float): The sigma of generated gaussian. + start_values (np.ndarray): The max values of one keypoint in the + corresponding limbs. + end_values (np.ndarray): The max values of the other keypoint in + the corresponding limbs. + + Returns: + np.ndarray: The generated pseudo heatmap. + """ + + heatmap = np.zeros([img_h, img_w], dtype=np.float32) + + for start, end, start_value, end_value in zip(starts, ends, + start_values, end_values): + value_coeff = min(start_value, end_value) + if value_coeff < self.eps: + continue + + min_x, max_x = min(start[0], end[0]), max(start[0], end[0]) + min_y, max_y = min(start[1], end[1]), max(start[1], end[1]) + + min_x = max(int(min_x - 3 * sigma), 0) + max_x = min(int(max_x + 3 * sigma) + 1, img_w) + min_y = max(int(min_y - 3 * sigma), 0) + max_y = min(int(max_y + 3 * sigma) + 1, img_h) + + x = np.arange(min_x, max_x, 1, np.float32) + y = np.arange(min_y, max_y, 1, np.float32) + + if not (len(x) and len(y)): + continue + + y = y[:, None] + x_0 = np.zeros_like(x) + y_0 = np.zeros_like(y) + + # distance to start keypoints + d2_start = ((x - start[0])**2 + (y - start[1])**2) + + # distance to end keypoints + d2_end = ((x - end[0])**2 + (y - end[1])**2) + + # the distance between start and end keypoints. + d2_ab = ((start[0] - end[0])**2 + (start[1] - end[1])**2) + + if d2_ab < 1: + full_map = self.generate_a_heatmap(img_h, img_w, [start], sigma, + [start_value]) + heatmap = np.maximum(heatmap, full_map) + continue + + coeff = (d2_start - d2_end + d2_ab) / 2. / d2_ab + + a_dominate = coeff <= 0 + b_dominate = coeff >= 1 + seg_dominate = 1 - a_dominate - b_dominate + + position = np.stack([x + y_0, y + x_0], axis=-1) + projection = start + np.stack([coeff, coeff], + axis=-1) * (end - start) + d2_line = position - projection + d2_line = d2_line[:, :, 0]**2 + d2_line[:, :, 1]**2 + d2_seg = (a_dominate * d2_start + b_dominate * d2_end + + seg_dominate * d2_line) + + patch = np.exp(-d2_seg / 2. / sigma**2) + patch = patch * value_coeff + + heatmap[min_y:max_y, min_x:max_x] = np.maximum( + heatmap[min_y:max_y, min_x:max_x], patch) + + return heatmap + + def generate_heatmap(self, img_h, img_w, kps, sigma, max_values): + """Generate pseudo heatmap for all keypoints and limbs in one frame (if + needed). 
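+        Keypoint maps and limb maps are stacked along the last axis, so the
+        returned array has shape (img_h, img_w, C), where C is the number of
+        keypoints plus the number of limbs actually generated (depending on
+        ``with_kp`` and ``with_limb``).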
+ + Args: + img_h (int): The height of the heatmap. + img_w (int): The width of the heatmap. + kps (np.ndarray): The coordinates of keypoints in this frame. + sigma (float): The sigma of generated gaussian. + max_values (np.ndarray): The confidence score of each keypoint. + + Returns: + np.ndarray: The generated pseudo heatmap. + """ + + heatmaps = [] + if self.with_kp: + num_kp = kps.shape[1] + for i in range(num_kp): + heatmap = self.generate_a_heatmap(img_h, img_w, kps[:, i], + sigma, max_values[:, i]) + heatmaps.append(heatmap) + + if self.with_limb: + for limb in self.skeletons: + start_idx, end_idx = limb + starts = kps[:, start_idx] + ends = kps[:, end_idx] + + start_values = max_values[:, start_idx] + end_values = max_values[:, end_idx] + heatmap = self.generate_a_limb_heatmap( + img_h, img_w, starts, ends, sigma, start_values, end_values) + heatmaps.append(heatmap) + + return np.stack(heatmaps, axis=-1) + + def gen_an_aug(self, results): + """Generate pseudo heatmaps for all frames. + + Args: + results (dict): The dictionary that contains all info of a sample. + + Returns: + list[np.ndarray]: The generated pseudo heatmaps. + """ + + all_kps = results['keypoint'] + kp_shape = all_kps.shape + + if 'keypoint_score' in results: + all_kpscores = results['keypoint_score'] + else: + all_kpscores = np.ones(kp_shape[:-1], dtype=np.float32) + + img_h, img_w = results['img_shape'] + num_frame = kp_shape[1] + + imgs = [] + for i in range(num_frame): + sigma = self.sigma + kps = all_kps[:, i] + kpscores = all_kpscores[:, i] + + max_values = np.ones(kpscores.shape, dtype=np.float32) + if self.use_score: + max_values = kpscores + + hmap = self.generate_heatmap(img_h, img_w, kps, sigma, max_values) + imgs.append(hmap) + + return imgs + + def __call__(self, results): + if not self.double: + results['imgs'] = np.stack(self.gen_an_aug(results)) + else: + results_ = cp.deepcopy(results) + flip = Flip_V2( + flip_ratio=1, left_kp=self.left_kp, right_kp=self.right_kp) + results_ = flip(results_) + results['imgs'] = np.concatenate( + [self.gen_an_aug(results), + self.gen_an_aug(results_)]) + results['label'] = np.array([results['label']]) + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'sigma={self.sigma}, ' + f'use_score={self.use_score}, ' + f'with_kp={self.with_kp}, ' + f'with_limb={self.with_limb}, ' + f'skeletons={self.skeletons}, ' + f'double={self.double}, ' + f'left_kp={self.left_kp}, ' + f'right_kp={self.right_kp})') + return repr_str diff --git a/Bank_second_part/detect_process/paddlevideo/loader/registry.py b/Bank_second_part/detect_process/paddlevideo/loader/registry.py new file mode 100644 index 0000000..add6631 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/registry.py @@ -0,0 +1,18 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
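+
+# Illustrative usage sketch (hedged: it assumes the Registry class from
+# ``..utils`` exposes ``register``, as used by the pipeline classes in this
+# commit, and a ``get`` lookup for resolving names from config files):
+#
+#     @PIPELINES.register()
+#     class MyTransform:
+#         def __call__(self, results):
+#             return results
+#
+#     transform_cls = PIPELINES.get("MyTransform")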
+ +from ..utils import Registry + +PIPELINES = Registry("pipeline") +DATASETS = Registry("datasets") diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/__init__.py b/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/__init__.py new file mode 100644 index 0000000..eefabbd --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/__init__.py @@ -0,0 +1,3 @@ +from .anet_prop import ANETproposal + +__all__ = ['ANETproposal'] diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..c5a7b5f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/__pycache__/anet_prop.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/__pycache__/anet_prop.cpython-310.pyc new file mode 100644 index 0000000..5987d5b Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/__pycache__/anet_prop.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/anet_prop.py b/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/anet_prop.py new file mode 100644 index 0000000..411b164 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/anet_prop.py @@ -0,0 +1,359 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import json +import numpy as np +import pandas as pd +import urllib.request as urllib2 +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +class ANETproposal(object): + """ + This class is used for calculating AR@N and AUC; + Code transfer from ActivityNet Gitub repository](https://github.com/activitynet/ActivityNet.git) + """ + GROUND_TRUTH_FIELDS = ['database', 'taxonomy', 'version'] + PROPOSAL_FIELDS = ['results', 'version', 'external_data'] + API = 'http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/challenge19/api.py' + + def __init__(self, + ground_truth_filename=None, + proposal_filename=None, + ground_truth_fields=GROUND_TRUTH_FIELDS, + proposal_fields=PROPOSAL_FIELDS, + tiou_thresholds=np.linspace(0.5, 0.95, 10), + max_avg_nr_proposals=None, + subset='validation', + verbose=False, + check_status=True): + if not ground_truth_filename: + raise IOError('Please input a valid ground truth file.') + if not proposal_filename: + raise IOError('Please input a valid proposal file.') + self.subset = subset + self.tiou_thresholds = tiou_thresholds + self.max_avg_nr_proposals = max_avg_nr_proposals + self.verbose = verbose + self.gt_fields = ground_truth_fields + self.pred_fields = proposal_fields + self.recall = None + self.avg_recall = None + self.proposals_per_video = None + self.check_status = check_status + # Retrieve blocked videos from server. 
+ if self.check_status: + self.blocked_videos = self.get_blocked_videos() + else: + self.blocked_videos = list() + # Import ground truth and proposals. + self.ground_truth, self.activity_index = self._import_ground_truth( + ground_truth_filename) + self.proposal = self._import_proposal(proposal_filename) + + if self.verbose: + print('[INIT] Loaded annotations from {} subset.'.format(subset)) + nr_gt = len(self.ground_truth) + print('\tNumber of ground truth instances: {}'.format(nr_gt)) + nr_pred = len(self.proposal) + print('\tNumber of proposals: {}'.format(nr_pred)) + print('\tFixed threshold for tiou score: {}'.format( + self.tiou_thresholds)) + + def _import_ground_truth(self, ground_truth_filename): + """ + Reads ground truth file, checks if it is well formatted, and returns + the ground truth instances and the activity classes. + + Parameters: + ground_truth_filename (str): full path to the ground truth json file. + Returns: + ground_truth (df): Data frame containing the ground truth instances. + activity_index (dict): Dictionary containing class index. + """ + with open(ground_truth_filename, 'r') as fobj: + data = json.load(fobj) + # Checking format + if not all([field in data.keys() for field in self.gt_fields]): + raise IOError('Please input a valid ground truth file.') + + # Read ground truth data. + activity_index, cidx = {}, 0 + video_lst, t_start_lst, t_end_lst, label_lst = [], [], [], [] + for videoid, v in data['database'].items(): + if self.subset != v['subset']: + continue + if videoid in self.blocked_videos: + continue + for ann in v['annotations']: + if ann['label'] not in activity_index: + activity_index[ann['label']] = cidx + cidx += 1 + video_lst.append(videoid) + t_start_lst.append(float(ann['segment'][0])) + t_end_lst.append(float(ann['segment'][1])) + label_lst.append(activity_index[ann['label']]) + + ground_truth = pd.DataFrame({ + 'video-id': video_lst, + 't-start': t_start_lst, + 't-end': t_end_lst, + 'label': label_lst + }) + return ground_truth, activity_index + + def _import_proposal(self, proposal_filename): + """ + Reads proposal file, checks if it is well formatted, and returns + the proposal instances. + + Parameters: + proposal_filename (str): Full path to the proposal json file. + Returns: + proposal (df): Data frame containing the proposal instances. + """ + with open(proposal_filename, 'r') as fobj: + data = json.load(fobj) + # Checking format... + if not all([field in data.keys() for field in self.pred_fields]): + raise IOError('Please input a valid proposal file.') + + # Read predictions. + video_lst, t_start_lst, t_end_lst = [], [], [] + score_lst = [] + for videoid, v in data['results'].items(): + if videoid in self.blocked_videos: + continue + for result in v: + video_lst.append(videoid) + t_start_lst.append(float(result['segment'][0])) + t_end_lst.append(float(result['segment'][1])) + score_lst.append(result['score']) + proposal = pd.DataFrame({ + 'video-id': video_lst, + 't-start': t_start_lst, + 't-end': t_end_lst, + 'score': score_lst + }) + return proposal + + def evaluate(self): + """ + Evaluates a proposal file. To measure the performance of a + method for the proposal task, we computes the area under the + average recall vs average number of proposals per video curve. 
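+
+        Example (an illustrative sketch; the file names are hypothetical)::
+
+            anet = ANETproposal('gt.json', 'proposals.json',
+                                max_avg_nr_proposals=100,
+                                subset='validation', check_status=False)
+            anet.evaluate()
+            recall, avg_recall = anet.recall, anet.avg_recall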
+ """ + recall, avg_recall, proposals_per_video = self.average_recall_vs_avg_nr_proposals( + self.ground_truth, + self.proposal, + max_avg_nr_proposals=self.max_avg_nr_proposals, + tiou_thresholds=self.tiou_thresholds) + + area_under_curve = np.trapz(avg_recall, proposals_per_video) + + if self.verbose: + print('[RESULTS] Performance on ActivityNet proposal task.') + with open("data/bmn/BMN_Test_results/auc_result.txt", + "a") as text_file: + text_file.write( + '\tArea Under the AR vs AN curve: {}% \n'.format( + 100. * float(area_under_curve) / + proposals_per_video[-1])) + print('\tArea Under the AR vs AN curve: {}%'.format( + 100. * float(area_under_curve) / proposals_per_video[-1])) + + self.recall = recall + self.avg_recall = avg_recall + self.proposals_per_video = proposals_per_video + + def average_recall_vs_avg_nr_proposals(self, + ground_truth, + proposals, + max_avg_nr_proposals=None, + tiou_thresholds=np.linspace( + 0.5, 0.95, 10)): + """ + Computes the average recall given an average number of + proposals per video. + + Parameters: + ground_truth(df): Data frame containing the ground truth instances. + Required fields: ['video-id', 't-start', 't-end'] + proposal(df): Data frame containing the proposal instances. + Required fields: ['video-id, 't-start', 't-end', 'score'] + tiou_thresholds(1d-array | optional): array with tiou thresholds. + + Returns: + recall(2d-array): recall[i,j] is recall at ith tiou threshold at the jth + average number of average number of proposals per video. + average_recall(1d-array): recall averaged over a list of tiou threshold. + This is equivalent to recall.mean(axis=0). + proposals_per_video(1d-array): average number of proposals per video. + """ + + # Get list of videos. + video_lst = ground_truth['video-id'].unique() + + if not max_avg_nr_proposals: + max_avg_nr_proposals = float( + proposals.shape[0]) / video_lst.shape[0] + + ratio = max_avg_nr_proposals * float( + video_lst.shape[0]) / proposals.shape[0] + + # Adaptation to query faster + ground_truth_gbvn = ground_truth.groupby('video-id') + proposals_gbvn = proposals.groupby('video-id') + + # For each video, computes tiou scores among the retrieved proposals. + score_lst = [] + total_nr_proposals = 0 + for videoid in video_lst: + # Get ground-truth instances associated to this video. + ground_truth_videoid = ground_truth_gbvn.get_group(videoid) + this_video_ground_truth = ground_truth_videoid.loc[:, [ + 't-start', 't-end' + ]].values + + # Get proposals for this video. + try: + proposals_videoid = proposals_gbvn.get_group(videoid) + except: + n = this_video_ground_truth.shape[0] + score_lst.append(np.zeros((n, 1))) + continue + + this_video_proposals = proposals_videoid.loc[:, + ['t-start', 't-end' + ]].values + + if this_video_proposals.shape[0] == 0: + n = this_video_ground_truth.shape[0] + score_lst.append(np.zeros((n, 1))) + continue + + # Sort proposals by score. + sort_idx = proposals_videoid['score'].argsort()[::-1] + this_video_proposals = this_video_proposals[sort_idx, :] + + if this_video_proposals.ndim != 2: + this_video_proposals = np.expand_dims(this_video_proposals, + axis=0) + if this_video_ground_truth.ndim != 2: + this_video_ground_truth = np.expand_dims( + this_video_ground_truth, axis=0) + + nr_proposals = np.minimum( + int(this_video_proposals.shape[0] * ratio), + this_video_proposals.shape[0]) + total_nr_proposals += nr_proposals + this_video_proposals = this_video_proposals[:nr_proposals, :] + + # Compute tiou scores. 
+ tiou = self.wrapper_segment_iou(this_video_proposals, + this_video_ground_truth) + score_lst.append(tiou) + + # Given that the length of the videos is really varied, we + # compute the number of proposals in terms of a ratio of the total + # proposals retrieved, i.e. average recall at a percentage of proposals + # retrieved per video. + + # Computes average recall. + pcn_lst = np.arange(1, 101) / 100.0 * (max_avg_nr_proposals * float( + video_lst.shape[0]) / total_nr_proposals) + matches = np.empty((video_lst.shape[0], pcn_lst.shape[0])) + positives = np.empty(video_lst.shape[0]) + recall = np.empty((tiou_thresholds.shape[0], pcn_lst.shape[0])) + # Iterates over each tiou threshold. + for ridx, tiou in enumerate(tiou_thresholds): + + # Inspect positives retrieved per video at different + # number of proposals (percentage of the total retrieved). + for i, score in enumerate(score_lst): + # Total positives per video. + positives[i] = score.shape[0] + # Find proposals that satisfies minimum tiou threshold. + true_positives_tiou = score >= tiou + # Get number of proposals as a percentage of total retrieved. + pcn_proposals = np.minimum( + (score.shape[1] * pcn_lst).astype(int), score.shape[1]) + + for j, nr_proposals in enumerate(pcn_proposals): + # Compute the number of matches for each percentage of the proposals + matches[i, j] = np.count_nonzero( + (true_positives_tiou[:, :nr_proposals]).sum(axis=1)) + + # Computes recall given the set of matches per video. + recall[ridx, :] = matches.sum(axis=0) / positives.sum() + + # Recall is averaged. + avg_recall = recall.mean(axis=0) + + # Get the average number of proposals per video. + proposals_per_video = pcn_lst * (float(total_nr_proposals) / + video_lst.shape[0]) + + return recall, avg_recall, proposals_per_video + + def get_blocked_videos(self, api=API): + api_url = '{}?action=get_blocked'.format(api) + req = urllib2.Request(api_url) + response = urllib2.urlopen(req) + return json.loads(response.read()) + + def wrapper_segment_iou(self, target_segments, candidate_segments): + """ + Compute intersection over union btw segments + Parameters: + target_segments(nd-array): 2-dim array in format [m x 2:=[init, end]] + candidate_segments(nd-array): 2-dim array in format [n x 2:=[init, end]] + Returns: + tiou(nd-array): 2-dim array [n x m] with IOU ratio. + Note: It assumes that candidate-segments are more scarce that target-segments + """ + if candidate_segments.ndim != 2 or target_segments.ndim != 2: + raise ValueError('Dimension of arguments is incorrect') + + n, m = candidate_segments.shape[0], target_segments.shape[0] + tiou = np.empty((n, m)) + for i in range(m): + tiou[:, i] = self.segment_iou(target_segments[i, :], + candidate_segments) + + return tiou + + def segment_iou(self, target_segment, candidate_segments): + """ + Compute the temporal intersection over union between a + target segment and all the test segments. + + Parameters: + target_segment(1d-array): Temporal target segment containing [starting, ending] times. + candidate_segments(2d-array): Temporal candidate segments containing N x [starting, ending] times. + + Returns: + tiou(1d-array): Temporal intersection over union score of the N's candidate segments. + """ + tt1 = np.maximum(target_segment[0], candidate_segments[:, 0]) + tt2 = np.minimum(target_segment[1], candidate_segments[:, 1]) + # Intersection including Non-negative overlap score. + segments_intersection = (tt2 - tt1).clip(0) + # Segment union. 
+ segments_union = (candidate_segments[:, 1] - candidate_segments[:, 0]) \ + + (target_segment[1] - target_segment[0]) - segments_intersection + # Compute overlap as the ratio of the intersection + # over union of two segments. + tIoU = segments_intersection.astype(float) / segments_union + return tIoU diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__init__.py b/Bank_second_part/detect_process/paddlevideo/metrics/__init__.py new file mode 100644 index 0000000..b693b87 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/__init__.py @@ -0,0 +1,36 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .bmn_metric import BMNMetric +from .build import build_metric +from .center_crop_metric import CenterCropMetric +from .depth_metric import DepthMetric +from .msrvtt_metric import MSRVTTMetric +from .multi_crop_metric import MultiCropMetric +from .registry import METRIC +from .skeleton_metric import SkeletonMetric +from .transnetv2_metric import TransNetV2Metric +from .youtube8m.eval_util import HitOneMetric +from .segmentation_metric import SegmentationMetric +from .ava_metric import AVAMetric +from .vos_metric import VOSMetric +from .center_crop_metric_MRI import CenterCropMetric_MRI +from .yowo_metric import YOWOMetric + +__all__ = [ + 'METRIC', 'build_metric', 'MultiCropMetric', 'BMNMetric', + 'CenterCropMetric', 'SkeletonMetric', 'HitOneMetric', 'TransNetV2Metric', + 'DepthMetric', 'MSRVTTMetric', 'VOSMetric', 'CenterCropMetric_MRI','AVAMetric', + 'SegmentationMetric', 'YOWOMetric' +] diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..9629b72 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/ava_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/ava_metric.cpython-310.pyc new file mode 100644 index 0000000..48d8818 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/ava_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/ava_utils.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/ava_utils.cpython-310.pyc new file mode 100644 index 0000000..2d80209 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/ava_utils.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..0e4c617 Binary files /dev/null and 
b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/bmn_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/bmn_metric.cpython-310.pyc new file mode 100644 index 0000000..9ea8d55 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/bmn_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/build.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/build.cpython-310.pyc new file mode 100644 index 0000000..a53f868 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/build.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/center_crop_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/center_crop_metric.cpython-310.pyc new file mode 100644 index 0000000..bfc8299 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/center_crop_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/center_crop_metric_MRI.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/center_crop_metric_MRI.cpython-310.pyc new file mode 100644 index 0000000..8b1974f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/center_crop_metric_MRI.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/depth_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/depth_metric.cpython-310.pyc new file mode 100644 index 0000000..cb68274 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/depth_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/msrvtt_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/msrvtt_metric.cpython-310.pyc new file mode 100644 index 0000000..c1574d9 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/msrvtt_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/multi_crop_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/multi_crop_metric.cpython-310.pyc new file mode 100644 index 0000000..66d2ac2 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/multi_crop_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/recall.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/recall.cpython-310.pyc new file mode 100644 index 0000000..e228e1d Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/recall.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/registry.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/registry.cpython-310.pyc new file mode 100644 index 0000000..6563d14 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/registry.cpython-310.pyc differ diff --git 
a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/segmentation_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/segmentation_metric.cpython-310.pyc new file mode 100644 index 0000000..f2b2e3f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/segmentation_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/skeleton_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/skeleton_metric.cpython-310.pyc new file mode 100644 index 0000000..10e12cf Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/skeleton_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/transnetv2_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/transnetv2_metric.cpython-310.pyc new file mode 100644 index 0000000..562d0fe Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/transnetv2_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/ucf24_utils.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/ucf24_utils.cpython-310.pyc new file mode 100644 index 0000000..3597d80 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/ucf24_utils.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/vos_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/vos_metric.cpython-310.pyc new file mode 100644 index 0000000..004dc17 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/vos_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/yowo_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/yowo_metric.cpython-310.pyc new file mode 100644 index 0000000..3815b7d Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/yowo_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/README.md b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/README.md new file mode 100644 index 0000000..7414d0f --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/README.md @@ -0,0 +1,2 @@ +The code under this folder is from the official [ActivityNet repo](https://github.com/activitynet/ActivityNet). +Some unused codes are removed to minimize the length of codes added. 
diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__init__.py b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..34f17ef Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/metrics.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/metrics.cpython-310.pyc new file mode 100644 index 0000000..39f6227 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/metrics.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/np_box_list.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/np_box_list.cpython-310.pyc new file mode 100644 index 0000000..24a0843 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/np_box_list.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/np_box_ops.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/np_box_ops.cpython-310.pyc new file mode 100644 index 0000000..7df2341 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/np_box_ops.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/object_detection_evaluation.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/object_detection_evaluation.cpython-310.pyc new file mode 100644 index 0000000..40abd9e Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/object_detection_evaluation.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/per_image_evaluation.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/per_image_evaluation.cpython-310.pyc new file mode 100644 index 0000000..a74a9bb Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/per_image_evaluation.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/standard_fields.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/standard_fields.cpython-310.pyc new file mode 100644 index 0000000..c0609b8 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/standard_fields.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/metrics.py b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/metrics.py new file mode 100644 index 0000000..13eb034 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/metrics.py @@ -0,0 +1,143 @@ +# copyright (c) 2021 
PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""Functions for computing metrics like precision, recall, CorLoc and so on."""
+
+import numpy as np
+
+
+def compute_precision_recall(scores, labels, num_gt):
+    """Compute precision and recall.
+
+    Args:
+        scores: A float numpy array representing detection score
+        labels: A boolean numpy array representing true/false positive labels
+        num_gt: Number of ground truth instances
+
+    Raises:
+        ValueError: if the input is not of the correct format
+
+    Returns:
+        precision: Fraction of positive instances over detected ones. This
+            value is None if no ground truth labels are present.
+        recall: Fraction of detected positive instances over all positive
+            instances. This value is None if no ground truth labels are
+            present.
+    """
+    if (not isinstance(labels, np.ndarray) or labels.dtype != bool
+            or len(labels.shape) != 1):
+        raise ValueError('labels must be single dimension bool numpy array')
+
+    if not isinstance(scores, np.ndarray) or len(scores.shape) != 1:
+        raise ValueError('scores must be single dimension numpy array')
+
+    if num_gt < np.sum(labels):
+        raise ValueError(
+            'Number of true positives must not exceed num_gt.')
+
+    if len(scores) != len(labels):
+        raise ValueError('scores and labels must be of the same size.')
+
+    if num_gt == 0:
+        return None, None
+
+    sorted_indices = np.argsort(scores)
+    sorted_indices = sorted_indices[::-1]
+    labels = labels.astype(int)
+    true_positive_labels = labels[sorted_indices]
+    false_positive_labels = 1 - true_positive_labels
+    cum_true_positives = np.cumsum(true_positive_labels)
+    cum_false_positives = np.cumsum(false_positive_labels)
+    precision = cum_true_positives.astype(float) / (
+        cum_true_positives + cum_false_positives)
+    recall = cum_true_positives.astype(float) / num_gt
+    return precision, recall
+
+
+def compute_average_precision(precision, recall):
+    """Compute Average Precision according to the definition in VOCdevkit.
+
+    Precision is modified to ensure that it does not decrease as recall
+    decreases.
+
+    Args:
+        precision: A float [N, 1] numpy array of precisions
+        recall: A float [N, 1] numpy array of recalls
+
+    Raises:
+        ValueError: if the input is not of the correct format
+
+    Returns:
+        average_precision: The area under the precision recall curve. NaN if
+            precision and recall are None.
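+
+    Example (an illustrative sketch of the interpolated-AP computation; the
+    values are made up)::
+
+        precision = np.array([1.0, 0.5], dtype=float)
+        recall = np.array([0.5, 1.0], dtype=float)
+        # interpolated AP = 0.5 * 1.0 + 0.5 * 0.5 = 0.75
+        compute_average_precision(precision, recall)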
+ """ + if precision is None: + if recall is not None: + raise ValueError('If precision is None, recall must also be None') + return np.NAN + + if not isinstance(precision, np.ndarray) or not isinstance( + recall, np.ndarray): + raise ValueError('precision and recall must be numpy array') + if precision.dtype != np.float or recall.dtype != np.float: + raise ValueError('input must be float numpy array.') + if len(precision) != len(recall): + raise ValueError('precision and recall must be of the same size.') + if not precision.size: + return 0.0 + if np.amin(precision) < 0 or np.amax(precision) > 1: + raise ValueError('Precision must be in the range of [0, 1].') + if np.amin(recall) < 0 or np.amax(recall) > 1: + raise ValueError('recall must be in the range of [0, 1].') + if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)): + raise ValueError('recall must be a non-decreasing array') + + recall = np.concatenate([[0], recall, [1]]) + precision = np.concatenate([[0], precision, [0]]) + + # Preprocess precision to be a non-decreasing array + for i in range(len(precision) - 2, -1, -1): + precision[i] = np.maximum(precision[i], precision[i + 1]) + + indices = np.where(recall[1:] != recall[:-1])[0] + 1 + average_precision = np.sum( + (recall[indices] - recall[indices - 1]) * precision[indices]) + return average_precision + + +def compute_cor_loc(num_gt_imgs_per_class, + num_images_correctly_detected_per_class): + """Compute CorLoc according to the definition in the following paper. + + https://www.robots.ox.ac.uk/~vgg/rg/papers/deselaers-eccv10.pdf + + Returns nans if there are no ground truth images for a class. + + Args: + num_gt_imgs_per_class: 1D array, representing number of images + containing at least one object instance of a particular class + num_images_correctly_detected_per_class: 1D array, representing number + of images that are correctly detected at least one object instance + of a particular class + + Returns: + corloc_per_class: A float numpy array represents the corloc score of + each class + """ + # Divide by zero expected for classes with no gt examples. + with np.errstate(divide='ignore', invalid='ignore'): + return np.where( + num_gt_imgs_per_class == 0, np.nan, + num_images_correctly_detected_per_class / num_gt_imgs_per_class) diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/np_box_list.py b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/np_box_list.py new file mode 100644 index 0000000..f9b101e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/np_box_list.py @@ -0,0 +1,138 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""Numpy BoxList classes and functions.""" + +import numpy as np + + +class BoxList: + """Box collection. 
+ + BoxList represents a list of bounding boxes as numpy array, where each + bounding box is represented as a row of 4 numbers, + [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes within + a given list correspond to a single image. + + Optionally, users can add additional related fields (such as + objectness/classification scores). + """ + + def __init__(self, data): + """Constructs box collection. + + Args: + data: a numpy array of shape [N, 4] representing box coordinates + + Raises: + ValueError: if bbox data is not a numpy array + ValueError: if invalid dimensions for bbox data + """ + if not isinstance(data, np.ndarray): + raise ValueError('data must be a numpy array.') + if len(data.shape) != 2 or data.shape[1] != 4: + raise ValueError('Invalid dimensions for box data.') + if data.dtype != np.float32 and data.dtype != np.float64: + raise ValueError( + 'Invalid data type for box data: float is required.') + if not self._is_valid_boxes(data): + raise ValueError('Invalid box data. data must be a numpy array of ' + 'N*[y_min, x_min, y_max, x_max]') + self.data = {'boxes': data} + + def num_boxes(self): + """Return number of boxes held in collections.""" + return self.data['boxes'].shape[0] + + def get_extra_fields(self): + """Return all non-box fields.""" + return [k for k in self.data if k != 'boxes'] + + def has_field(self, field): + return field in self.data + + def add_field(self, field, field_data): + """Add data to a specified field. + + Args: + field: a string parameter used to speficy a related field to be + accessed. + field_data: a numpy array of [N, ...] representing the data + associated with the field. + Raises: + ValueError: if the field is already exist or the dimension of the + field data does not matches the number of boxes. + """ + if self.has_field(field): + raise ValueError('Field ' + field + 'already exists') + if len(field_data.shape) < 1 or field_data.shape[0] != self.num_boxes( + ): + raise ValueError('Invalid dimensions for field data') + self.data[field] = field_data + + def get(self): + """Convenience function for accesssing box coordinates. + + Returns: + a numpy array of shape [N, 4] representing box corners + """ + return self.get_field('boxes') + + def get_field(self, field): + """Accesses data associated with the specified field in the box + collection. + + Args: + field: a string parameter used to speficy a related field to be + accessed. + + Returns: + a numpy 1-d array representing data of an associated field + + Raises: + ValueError: if invalid field + """ + if not self.has_field(field): + raise ValueError(f'field {field} does not exist') + return self.data[field] + + def get_coordinates(self): + """Get corner coordinates of boxes. + + Returns: + a list of 4 1-d numpy arrays [y_min, x_min, y_max, x_max] + """ + box_coordinates = self.get() + y_min = box_coordinates[:, 0] + x_min = box_coordinates[:, 1] + y_max = box_coordinates[:, 2] + x_max = box_coordinates[:, 3] + return [y_min, x_min, y_max, x_max] + + def _is_valid_boxes(self, data): + """Check whether data fullfills the format of N*[ymin, xmin, ymax, + xmin]. + + Args: + data: a numpy array of shape [N, 4] representing box coordinates + + Returns: + a boolean indicating whether all ymax of boxes are equal or greater + than ymin, and all xmax of boxes are equal or greater than xmin. 
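+
+        For example (illustrative), ``[[0.1, 0.2, 0.4, 0.5]]`` is valid,
+        while ``[[0.5, 0.2, 0.4, 0.5]]`` is not, since its ymin exceeds its
+        ymax.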
+ """ + if len(data): + for v in data: + if v[0] > v[2] or v[1] > v[3]: + return False + return True diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/np_box_ops.py b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/np_box_ops.py new file mode 100644 index 0000000..94e7d30 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/np_box_ops.py @@ -0,0 +1,98 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Operations for [N, 4] numpy arrays representing bounding boxes. + +Example box operations that are supported: + * Areas: compute bounding box areas + * IOU: pairwise intersection-over-union scores +""" + +import numpy as np + + +def area(boxes): + """Computes area of boxes. + + Args: + boxes: Numpy array with shape [N, 4] holding N boxes + + Returns: + a numpy array with shape [N*1] representing box areas + """ + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +def intersection(boxes1, boxes2): + """Compute pairwise intersection areas between boxes. + + Args: + boxes1: a numpy array with shape [N, 4] holding N boxes + boxes2: a numpy array with shape [M, 4] holding M boxes + + Returns: + a numpy array with shape [N*M] representing pairwise intersection area + """ + [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1) + [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1) + + all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2)) + all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2)) + intersect_heights = np.maximum( + np.zeros(all_pairs_max_ymin.shape), + all_pairs_min_ymax - all_pairs_max_ymin) + all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2)) + all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2)) + intersect_widths = np.maximum( + np.zeros(all_pairs_max_xmin.shape), + all_pairs_min_xmax - all_pairs_max_xmin) + return intersect_heights * intersect_widths + + +def iou(boxes1, boxes2): + """Computes pairwise intersection-over-union between box collections. + + Args: + boxes1: a numpy array with shape [N, 4] holding N boxes. + boxes2: a numpy array with shape [M, 4] holding N boxes. + + Returns: + a numpy array with shape [N, M] representing pairwise iou scores. + """ + intersect = intersection(boxes1, boxes2) + area1 = area(boxes1) + area2 = area(boxes2) + union = ( + np.expand_dims(area1, axis=1) + np.expand_dims(area2, axis=0) - + intersect) + return intersect / union + + +def ioa(boxes1, boxes2): + """Computes pairwise intersection-over-area between box collections. + + Intersection-over-area (ioa) between two boxes box1 and box2 is defined as + their intersection area over box2's area. Note that ioa is not symmetric, + that is, IOA(box1, box2) != IOA(box2, box1). + + Args: + boxes1: a numpy array with shape [N, 4] holding N boxes. 
+ boxes2: a numpy array with shape [M, 4] holding N boxes. + + Returns: + a numpy array with shape [N, M] representing pairwise ioa scores. + """ + intersect = intersection(boxes1, boxes2) + areas = np.expand_dims(area(boxes2), axis=0) + return intersect / areas diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py new file mode 100644 index 0000000..c9f0054 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py @@ -0,0 +1,658 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""object_detection_evaluation module. + +ObjectDetectionEvaluation is a class which manages ground truth information of +a object detection dataset, and computes frequently used detection metrics such +as Precision, Recall, CorLoc of the provided detection results. +It supports the following operations: +1) Add ground truth information of images sequentially. +2) Add detection result of images sequentially. +3) Evaluate detection metrics on already inserted detection results. +4) Write evaluation result into a pickle file for future processing or + visualization. + +Note: This module operates on numpy boxes and box lists. +""" +import collections +import logging +from abc import ABCMeta, abstractmethod + +import numpy as np + +from . import metrics, per_image_evaluation, standard_fields + + +class DetectionEvaluator: + """Interface for object detection evalution classes. + + Example usage of the Evaluator: + ------------------------------ + evaluator = DetectionEvaluator(categories) + + # Detections and groundtruth for image 1. + evaluator.add_single_groundtruth_image_info(...) + evaluator.add_single_detected_image_info(...) + + # Detections and groundtruth for image 2. + evaluator.add_single_groundtruth_image_info(...) + evaluator.add_single_detected_image_info(...) + + metrics_dict = evaluator.evaluate() + """ + + __metaclass__ = ABCMeta + + def __init__(self, categories): + """Constructor. + + Args: + categories: A list of dicts, each of which has the following keys - + 'id': (required) an integer id uniquely identifying this + category. + 'name': (required) string representing category name e.g., + 'cat', 'dog'. + """ + self._categories = categories + + @abstractmethod + def add_single_ground_truth_image_info(self, image_id, groundtruth_dict): + """Adds groundtruth for a single image to be used for evaluation. + + Args: + image_id: A unique string/integer identifier for the image. + groundtruth_dict: A dictionary of groundtruth numpy arrays required + for evaluations. + """ + + @abstractmethod + def add_single_detected_image_info(self, image_id, detections_dict): + """Adds detections for a single image to be used for evaluation. 
+ + Args: + image_id: A unique string/integer identifier for the image. + detections_dict: A dictionary of detection numpy arrays required + for evaluation. + """ + + @abstractmethod + def evaluate(self): + """Evaluates detections and returns a dictionary of metrics.""" + + @abstractmethod + def clear(self): + """Clears the state to prepare for a fresh evaluation.""" + + +class ObjectDetectionEvaluator(DetectionEvaluator): + """A class to evaluate detections.""" + + def __init__( + self, + categories, + matching_iou_threshold=0.5, + evaluate_corlocs=False, + metric_prefix=None, + use_weighted_mean_ap=False, + evaluate_masks=False, + ): + """Constructor. + + Args: + categories: A list of dicts, each of which has the following keys - + 'id': (required) an integer id uniquely identifying this + category. + 'name': (required) string representing category name e.g., + 'cat', 'dog'. + matching_iou_threshold: IOU threshold to use for matching + groundtruth boxes to detection boxes. + evaluate_corlocs: (optional) boolean which determines if corloc + scores are to be returned or not. + metric_prefix: (optional) string prefix for metric name; if None, + no prefix is used. + use_weighted_mean_ap: (optional) boolean which determines if the + mean average precision is computed directly from the scores and + tp_fp_labels of all classes. + evaluate_masks: If False, evaluation will be performed based on + boxes. If True, mask evaluation will be performed instead. + + Raises: + ValueError: If the category ids are not 1-indexed. + """ + super(ObjectDetectionEvaluator, self).__init__(categories) + self._num_classes = max([cat['id'] for cat in categories]) + if min(cat['id'] for cat in categories) < 1: + raise ValueError('Classes should be 1-indexed.') + self._matching_iou_threshold = matching_iou_threshold + self._use_weighted_mean_ap = use_weighted_mean_ap + self._label_id_offset = 1 + self._evaluate_masks = evaluate_masks + self._evaluation = ObjectDetectionEvaluation( + num_groundtruth_classes=self._num_classes, + matching_iou_threshold=self._matching_iou_threshold, + use_weighted_mean_ap=self._use_weighted_mean_ap, + label_id_offset=self._label_id_offset, + ) + self._image_ids = set([]) + self._evaluate_corlocs = evaluate_corlocs + self._metric_prefix = (metric_prefix + '_') if metric_prefix else '' + + def add_single_ground_truth_image_info(self, image_id, groundtruth_dict): + """Adds groundtruth for a single image to be used for evaluation. + + Args: + image_id: A unique string/integer identifier for the image. + groundtruth_dict: A dictionary containing - + standard_fields.InputDataFields.groundtruth_boxes: float32 + numpy array of shape [num_boxes, 4] containing `num_boxes` + groundtruth boxes of the format [ymin, xmin, ymax, xmax] in + absolute image coordinates. + standard_fields.InputDataFields.groundtruth_classes: integer + numpy array of shape [num_boxes] containing 1-indexed + groundtruth classes for the boxes. + standard_fields.InputDataFields.groundtruth_difficult: Optional + length M numpy boolean array denoting whether a ground + truth box is a difficult instance or not. This field is + optional to support the case that no boxes are difficult. + standard_fields.InputDataFields.groundtruth_instance_masks: + Optional numpy array of shape [num_boxes, height, width] + with values in {0, 1}. + + Raises: + ValueError: On adding groundtruth for an image more than once. Will + also raise error if instance masks are not in groundtruth + dictionary. 
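+
+        Example (hypothetical values, shown only to illustrate the expected
+        dictionary layout; "evaluator" stands for any instance of this class
+        and is an assumption, not code from the original file):
+            >>> evaluator.add_single_ground_truth_image_info(
+            ...     image_id='img_0',
+            ...     groundtruth_dict={
+            ...         standard_fields.InputDataFields.groundtruth_boxes:
+            ...             np.array([[10., 10., 50., 50.]], dtype=np.float32),
+            ...         standard_fields.InputDataFields.groundtruth_classes:
+            ...             np.array([1], dtype=int),
+            ...     })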
+ """ + if image_id in self._image_ids: + raise ValueError( + 'Image with id {} already added.'.format(image_id)) + + groundtruth_classes = ( + groundtruth_dict[ + standard_fields.InputDataFields.groundtruth_classes] - + self._label_id_offset) + # If the key is not present in the groundtruth_dict or the array is + # empty (unless there are no annotations for the groundtruth on this + # image) use values from the dictionary or insert None otherwise. + if (standard_fields.InputDataFields.groundtruth_difficult + in groundtruth_dict.keys()) and (groundtruth_dict[ + standard_fields.InputDataFields.groundtruth_difficult].size + or + not groundtruth_classes.size): + groundtruth_difficult = groundtruth_dict[ + standard_fields.InputDataFields.groundtruth_difficult] + else: + groundtruth_difficult = None + if not len(self._image_ids) % 1000: + logging.warn(('image %s does not have groundtruth difficult ' + 'flag specified'), image_id) + groundtruth_masks = None + if self._evaluate_masks: + if (standard_fields.InputDataFields.groundtruth_instance_masks + not in groundtruth_dict): + raise ValueError( + 'Instance masks not in groundtruth dictionary.') + groundtruth_masks = groundtruth_dict[ + standard_fields.InputDataFields.groundtruth_instance_masks] + self._evaluation.add_single_ground_truth_image_info( + image_key=image_id, + groundtruth_boxes=groundtruth_dict[ + standard_fields.InputDataFields.groundtruth_boxes], + groundtruth_class_labels=groundtruth_classes, + groundtruth_is_difficult_list=groundtruth_difficult, + groundtruth_masks=groundtruth_masks, + ) + self._image_ids.update([image_id]) + + def add_single_detected_image_info(self, image_id, detections_dict): + """Adds detections for a single image to be used for evaluation. + + Args: + image_id: A unique string/integer identifier for the image. + detections_dict: A dictionary containing - + standard_fields.DetectionResultFields.detection_boxes: float32 + numpy array of shape [num_boxes, 4] containing `num_boxes` + detection boxes of the format [ymin, xmin, ymax, xmax] in + absolute image coordinates. + standard_fields.DetectionResultFields.detection_scores: float32 + numpy array of shape [num_boxes] containing detection + scores for the boxes. + standard_fields.DetectionResultFields.detection_classes: + integer numpy array of shape [num_boxes] containing + 1-indexed detection classes for the boxes. + standard_fields.DetectionResultFields.detection_masks: uint8 + numpy array of shape [num_boxes, height, width] containing + `num_boxes` masks of values ranging between 0 and 1. + + Raises: + ValueError: If detection masks are not in detections dictionary. + """ + detection_classes = ( + detections_dict[ + standard_fields.DetectionResultFields.detection_classes] - + self._label_id_offset) + detection_masks = None + if self._evaluate_masks: + if (standard_fields.DetectionResultFields.detection_masks + not in detections_dict): + raise ValueError( + 'Detection masks not in detections dictionary.') + detection_masks = detections_dict[ + standard_fields.DetectionResultFields.detection_masks] + self._evaluation.add_single_detected_image_info( + image_key=image_id, + detected_boxes=detections_dict[ + standard_fields.DetectionResultFields.detection_boxes], + detected_scores=detections_dict[ + standard_fields.DetectionResultFields.detection_scores], + detected_class_labels=detection_classes, + detected_masks=detection_masks, + ) + + def create_category_index(self, categories): + """Creates dictionary of COCO compatible categories keyed by category + id. 
+ + Args: + categories: a list of dicts, each of which has the following keys: + 'id': (required) an integer id uniquely identifying this + category. + 'name': (required) string representing category name + e.g., 'cat', 'dog', 'pizza'. + + Returns: + category_index: a dict containing the same entries as categories, + but keyed by the 'id' field of each category. + """ + category_index = {} + for cat in categories: + category_index[cat['id']] = cat + return category_index + + def evaluate(self): + """Compute evaluation result. + + Returns: + A dictionary of metrics with the following fields - + + 1. summary_metrics: + 'Precision/mAP@IOU': mean average + precision at the specified IOU threshold + + 2. per_category_ap: category specific results with keys of the form + 'PerformanceByCategory/mAP@IOU/category' + """ + ( + per_class_ap, + mean_ap, + _, + _, + per_class_corloc, + mean_corloc, + ) = self._evaluation.evaluate() + + metric = f'mAP@{self._matching_iou_threshold}IOU' + pascal_metrics = {self._metric_prefix + metric: mean_ap} + if self._evaluate_corlocs: + pascal_metrics[self._metric_prefix + + 'Precision/meanCorLoc@{}IOU'.format( + self._matching_iou_threshold)] = mean_corloc + category_index = self.create_category_index(self._categories) + for idx in range(per_class_ap.size): + if idx + self._label_id_offset in category_index: + display_name = ( + self._metric_prefix + + 'PerformanceByCategory/AP@{}IOU/{}'.format( + self._matching_iou_threshold, + category_index[idx + self._label_id_offset]['name'], + )) + pascal_metrics[display_name] = per_class_ap[idx] + + # Optionally add CorLoc metrics.classes + if self._evaluate_corlocs: #False + display_name = ( + self._metric_prefix + + 'PerformanceByCategory/CorLoc@{}IOU/{}'.format( + self._matching_iou_threshold, + category_index[idx + + self._label_id_offset]['name'], + )) + pascal_metrics[display_name] = per_class_corloc[idx] + + return pascal_metrics + + def clear(self): + """Clears the state to prepare for a fresh evaluation.""" + self._evaluation = ObjectDetectionEvaluation( + num_groundtruth_classes=self._num_classes, + matching_iou_threshold=self._matching_iou_threshold, + use_weighted_mean_ap=self._use_weighted_mean_ap, + label_id_offset=self._label_id_offset, + ) + self._image_ids.clear() + + +class PascalDetectionEvaluator(ObjectDetectionEvaluator): + """A class to evaluate detections using PASCAL metrics.""" + + def __init__(self, categories, matching_iou_threshold=0.5): + super(PascalDetectionEvaluator, self).__init__( + categories, + matching_iou_threshold=matching_iou_threshold, + evaluate_corlocs=False, + use_weighted_mean_ap=False, + ) + + +ObjectDetectionEvalMetrics = collections.namedtuple( + 'ObjectDetectionEvalMetrics', + [ + 'average_precisions', + 'mean_ap', + 'precisions', + 'recalls', + 'corlocs', + 'mean_corloc', + ], +) + + +class ObjectDetectionEvaluation: + """Internal implementation of Pascal object detection metrics.""" + + def __init__( + self, + num_groundtruth_classes, + matching_iou_threshold=0.5, + nms_iou_threshold=1.0, + nms_max_output_boxes=10000, + use_weighted_mean_ap=False, + label_id_offset=0, + ): + if num_groundtruth_classes < 1: + raise ValueError( + 'Need at least 1 groundtruth class for evaluation.') + + self.per_image_eval = per_image_evaluation.PerImageEvaluation( + num_groundtruth_classes=num_groundtruth_classes, + matching_iou_threshold=matching_iou_threshold, + ) + self.num_class = num_groundtruth_classes + self.use_weighted_mean_ap = use_weighted_mean_ap + self.label_id_offset = 
label_id_offset + + self.groundtruth_boxes = {} + self.groundtruth_class_labels = {} + self.groundtruth_masks = {} + self.groundtruth_is_difficult_list = {} + self.groundtruth_is_group_of_list = {} + self.num_gt_instances_per_class = np.zeros(self.num_class, dtype=int) + self.num_gt_imgs_per_class = np.zeros(self.num_class, dtype=int) + + self._initialize_detections() + + def _initialize_detections(self): + self.detection_keys = set() + self.scores_per_class = [[] for _ in range(self.num_class)] + self.tp_fp_labels_per_class = [[] for _ in range(self.num_class)] + self.num_images_correctly_detected_per_class = np.zeros(self.num_class) + self.average_precision_per_class = np.empty( + self.num_class, dtype=float) + self.average_precision_per_class.fill(np.nan) + self.precisions_per_class = [] + self.recalls_per_class = [] + self.corloc_per_class = np.ones(self.num_class, dtype=float) + + def clear_detections(self): + self._initialize_detections() + + def add_single_ground_truth_image_info( + self, + image_key, + groundtruth_boxes, + groundtruth_class_labels, + groundtruth_is_difficult_list=None, + groundtruth_is_group_of_list=None, + groundtruth_masks=None, + ): + """Adds groundtruth for a single image to be used for evaluation. + + Args: + image_key: A unique string/integer identifier for the image. + groundtruth_boxes: float32 numpy array of shape [num_boxes, 4] + containing `num_boxes` groundtruth boxes of the format + [ymin, xmin, ymax, xmax] in absolute image coordinates. + groundtruth_class_labels: integer numpy array of shape [num_boxes] + containing 0-indexed groundtruth classes for the boxes. + groundtruth_is_difficult_list: A length M numpy boolean array + denoting whether a ground truth box is a difficult instance or + not. To support the case that no boxes are difficult, it is by + default set as None. + groundtruth_is_group_of_list: A length M numpy boolean array + denoting whether a ground truth box is a group-of box or not. + To support the case that no boxes are groups-of, it is by + default set as None. + groundtruth_masks: uint8 numpy array of shape + [num_boxes, height, width] containing `num_boxes` groundtruth + masks. The mask values range from 0 to 1. + """ + if image_key in self.groundtruth_boxes: + logging.warn(('image %s has already been added to the ground ' + 'truth database.'), image_key) + return + + self.groundtruth_boxes[image_key] = groundtruth_boxes + self.groundtruth_class_labels[image_key] = groundtruth_class_labels + self.groundtruth_masks[image_key] = groundtruth_masks + if groundtruth_is_difficult_list is None: + num_boxes = groundtruth_boxes.shape[0] + groundtruth_is_difficult_list = np.zeros(num_boxes, dtype=bool) + self.groundtruth_is_difficult_list[ + image_key] = groundtruth_is_difficult_list.astype(dtype=bool) + if groundtruth_is_group_of_list is None: + num_boxes = groundtruth_boxes.shape[0] + groundtruth_is_group_of_list = np.zeros(num_boxes, dtype=bool) + self.groundtruth_is_group_of_list[ + image_key] = groundtruth_is_group_of_list.astype(dtype=bool) + + self._update_ground_truth_statistics( + groundtruth_class_labels, + groundtruth_is_difficult_list.astype(dtype=bool), + groundtruth_is_group_of_list.astype(dtype=bool), + ) + + def add_single_detected_image_info( + self, + image_key, + detected_boxes, + detected_scores, + detected_class_labels, + detected_masks=None, + ): + """Adds detections for a single image to be used for evaluation. + + Args: + image_key: A unique string/integer identifier for the image. 
+ detected_boxes: float32 numpy array of shape [num_boxes, 4] + containing `num_boxes` detection boxes of the format + [ymin, xmin, ymax, xmax] in absolute image coordinates. + detected_scores: float32 numpy array of shape [num_boxes] + containing detection scores for the boxes. + detected_class_labels: integer numpy array of shape [num_boxes] + containing 0-indexed detection classes for the boxes. + detected_masks: np.uint8 numpy array of shape + [num_boxes, height, width] containing `num_boxes` detection + masks with values ranging between 0 and 1. + + Raises: + ValueError: if the number of boxes, scores and class labels differ + in length. + """ + if len(detected_boxes) != len(detected_scores) or len( + detected_boxes) != len(detected_class_labels): + raise ValueError( + 'detected_boxes, detected_scores and ' + 'detected_class_labels should all have same lengths. Got' + '[%d, %d, %d]' % len(detected_boxes), + len(detected_scores), + len(detected_class_labels), + ) + + if image_key in self.detection_keys: + logging.warn(('image %s has already been added to the ground ' + 'truth database.'), image_key) + return + + self.detection_keys.add(image_key) + if image_key in self.groundtruth_boxes: + groundtruth_boxes = self.groundtruth_boxes[image_key] + groundtruth_class_labels = self.groundtruth_class_labels[image_key] + # Masks are popped instead of look up. The reason is that we do not + # want to keep all masks in memory which can cause memory overflow. + groundtruth_masks = self.groundtruth_masks.pop(image_key) + groundtruth_is_difficult_list = self.groundtruth_is_difficult_list[ + image_key] + groundtruth_is_group_of_list = self.groundtruth_is_group_of_list[ + image_key] + else: + groundtruth_boxes = np.empty(shape=[0, 4], dtype=float) + groundtruth_class_labels = np.array([], dtype=int) + if detected_masks is None: + groundtruth_masks = None + else: + groundtruth_masks = np.empty(shape=[0, 1, 1], dtype=float) + groundtruth_is_difficult_list = np.array([], dtype=bool) + groundtruth_is_group_of_list = np.array([], dtype=bool) + ( + scores, + tp_fp_labels, + ) = self.per_image_eval.compute_object_detection_metrics( + detected_boxes=detected_boxes, + detected_scores=detected_scores, + detected_class_labels=detected_class_labels, + groundtruth_boxes=groundtruth_boxes, + groundtruth_class_labels=groundtruth_class_labels, + groundtruth_is_difficult_list=groundtruth_is_difficult_list, + groundtruth_is_group_of_list=groundtruth_is_group_of_list, + detected_masks=detected_masks, + groundtruth_masks=groundtruth_masks, + ) + + for i in range(self.num_class): + if scores[i].shape[0] > 0: + self.scores_per_class[i].append(scores[i]) + self.tp_fp_labels_per_class[i].append(tp_fp_labels[i]) + + def _update_ground_truth_statistics( + self, + groundtruth_class_labels, + groundtruth_is_difficult_list, + groundtruth_is_group_of_list, + ): + """Update grouth truth statitistics. + + 1. Difficult boxes are ignored when counting the number of ground truth + instances as done in Pascal VOC devkit. + 2. Difficult boxes are treated as normal boxes when computing CorLoc + related statitistics. 
+ + Args: + groundtruth_class_labels: An integer numpy array of length M, + representing M class labels of object instances in ground truth + groundtruth_is_difficult_list: A boolean numpy array of length M + denoting whether a ground truth box is a difficult instance or + not + groundtruth_is_group_of_list: A boolean numpy array of length M + denoting whether a ground truth box is a group-of box or not + """ + for class_index in range(self.num_class): + num_gt_instances = np.sum(groundtruth_class_labels[ + ~groundtruth_is_difficult_list + & ~groundtruth_is_group_of_list] == class_index) + self.num_gt_instances_per_class[class_index] += num_gt_instances + if np.any(groundtruth_class_labels == class_index): + self.num_gt_imgs_per_class[class_index] += 1 + + def evaluate(self): + """Compute evaluation result. + + Returns: + A named tuple with the following fields - + average_precision: float numpy array of average precision for + each class. + mean_ap: mean average precision of all classes, float scalar + precisions: List of precisions, each precision is a float numpy + array + recalls: List of recalls, each recall is a float numpy array + corloc: numpy float array + mean_corloc: Mean CorLoc score for each class, float scalar + """ + if (self.num_gt_instances_per_class == 0).any(): + print( + 'The following classes have no ground truth examples: %s', + np.squeeze(np.argwhere(self.num_gt_instances_per_class == 0)) + + self.label_id_offset, "self.detection_keys:",self.detection_keys + ) + + if self.use_weighted_mean_ap: + all_scores = np.array([], dtype=float) + all_tp_fp_labels = np.array([], dtype=bool) + + for class_index in range(self.num_class): + if self.num_gt_instances_per_class[class_index] == 0: + continue + + if not self.scores_per_class[class_index]: + scores = np.array([], dtype=float) + tp_fp_labels = np.array([], dtype=bool) + else: + scores = np.concatenate(self.scores_per_class[class_index]) + tp_fp_labels = np.concatenate( + self.tp_fp_labels_per_class[class_index]) + if self.use_weighted_mean_ap: + all_scores = np.append(all_scores, scores) + all_tp_fp_labels = np.append(all_tp_fp_labels, tp_fp_labels) + precision, recall = metrics.compute_precision_recall( + scores, + tp_fp_labels, + self.num_gt_instances_per_class[class_index], + ) + self.precisions_per_class.append(precision) + self.recalls_per_class.append(recall) + average_precision = metrics.compute_average_precision( + precision, recall) + self.average_precision_per_class[class_index] = average_precision + + self.corloc_per_class = metrics.compute_cor_loc( + self.num_gt_imgs_per_class, + self.num_images_correctly_detected_per_class, + ) + + if self.use_weighted_mean_ap: + num_gt_instances = np.sum(self.num_gt_instances_per_class) + precision, recall = metrics.compute_precision_recall( + all_scores, all_tp_fp_labels, num_gt_instances) + mean_ap = metrics.compute_average_precision(precision, recall) + else: + mean_ap = np.nanmean(self.average_precision_per_class) + mean_corloc = np.nanmean(self.corloc_per_class) + return ObjectDetectionEvalMetrics( + self.average_precision_per_class, + mean_ap, + self.precisions_per_class, + self.recalls_per_class, + self.corloc_per_class, + mean_corloc, + ) diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py new file mode 100644 index 0000000..3013ae7 --- /dev/null +++ 
b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py @@ -0,0 +1,452 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""Evaluate Object Detection result on a single image. + +Annotate each detected result as true positives or false positive according to +a predefined IOU ratio. Non Maximum Supression is used by default. Multi class +detection is supported by default. Based on the settings, per image evaluation +is either performed on boxes or on object masks. +""" + +import numpy as np + +from . import np_box_list, np_box_ops + + +class PerImageEvaluation: + """Evaluate detection result of a single image.""" + + def __init__(self, num_groundtruth_classes, matching_iou_threshold=0.5): + """Initialized PerImageEvaluation by evaluation parameters. + + Args: + num_groundtruth_classes: Number of ground truth object classes + matching_iou_threshold: A ratio of area intersection to union, + which is the threshold to consider whether a detection is true + positive or not + """ + self.matching_iou_threshold = matching_iou_threshold + self.num_groundtruth_classes = num_groundtruth_classes + + def compute_object_detection_metrics( + self, + detected_boxes, + detected_scores, + detected_class_labels, + groundtruth_boxes, + groundtruth_class_labels, + groundtruth_is_difficult_list, + groundtruth_is_group_of_list, + detected_masks=None, + groundtruth_masks=None, + ): + """Evaluates detections as being tp, fp or ignored from a single image. + + The evaluation is done in two stages: + 1. All detections are matched to non group-of boxes; true positives + are determined and detections matched to difficult boxes are + ignored. + 2. Detections that are determined as false positives are matched + against group-of boxes and ignored if matched. + + Args: + detected_boxes: A float numpy array of shape [N, 4], representing N + regions of detected object regions. + Each row is of the format [y_min, x_min, y_max, x_max] + detected_scores: A float numpy array of shape [N, 1], representing + the confidence scores of the detected N object instances. + detected_class_labels: A integer numpy array of shape [N, 1], + repreneting the class labels of the detected N object + instances. + groundtruth_boxes: A float numpy array of shape [M, 4], + representing M regions of object instances in ground truth + groundtruth_class_labels: An integer numpy array of shape [M, 1], + representing M class labels of object instances in ground truth + groundtruth_is_difficult_list: A boolean numpy array of length M + denoting whether a ground truth box is a difficult instance or + not + groundtruth_is_group_of_list: A boolean numpy array of length M + denoting whether a ground truth box has group-of tag + detected_masks: (optional) A uint8 numpy array of shape + [N, height, width]. If not None, the metrics will be computed + based on masks. 
+ groundtruth_masks: (optional) A uint8 numpy array of shape + [M, height, width]. + + Returns: + scores: A list of C float numpy arrays. Each numpy array is of + shape [K, 1], representing K scores detected with object class + label c + tp_fp_labels: A list of C boolean numpy arrays. Each numpy array + is of shape [K, 1], representing K True/False positive label of + object instances detected with class label c + """ + ( + detected_boxes, + detected_scores, + detected_class_labels, + detected_masks, + ) = self._remove_invalid_boxes( + detected_boxes, + detected_scores, + detected_class_labels, + detected_masks, + ) + scores, tp_fp_labels = self._compute_tp_fp( + detected_boxes=detected_boxes, + detected_scores=detected_scores, + detected_class_labels=detected_class_labels, + groundtruth_boxes=groundtruth_boxes, + groundtruth_class_labels=groundtruth_class_labels, + groundtruth_is_difficult_list=groundtruth_is_difficult_list, + groundtruth_is_group_of_list=groundtruth_is_group_of_list, + detected_masks=detected_masks, + groundtruth_masks=groundtruth_masks, + ) + + return scores, tp_fp_labels + + def _compute_tp_fp( + self, + detected_boxes, + detected_scores, + detected_class_labels, + groundtruth_boxes, + groundtruth_class_labels, + groundtruth_is_difficult_list, + groundtruth_is_group_of_list, + detected_masks=None, + groundtruth_masks=None, + ): + """Labels true/false positives of detections of an image across all + classes. + + Args: + detected_boxes: A float numpy array of shape [N, 4], representing N + regions of detected object regions. + Each row is of the format [y_min, x_min, y_max, x_max] + detected_scores: A float numpy array of shape [N, 1], representing + the confidence scores of the detected N object instances. + detected_class_labels: A integer numpy array of shape [N, 1], + repreneting the class labels of the detected N object + instances. + groundtruth_boxes: A float numpy array of shape [M, 4], + representing M regions of object instances in ground truth + groundtruth_class_labels: An integer numpy array of shape [M, 1], + representing M class labels of object instances in ground truth + groundtruth_is_difficult_list: A boolean numpy array of length M + denoting whether a ground truth box is a difficult instance or + not + groundtruth_is_group_of_list: A boolean numpy array of length M + denoting whether a ground truth box has group-of tag + detected_masks: (optional) A np.uint8 numpy array of shape + [N, height, width]. If not None, the scores will be computed + based on masks. + groundtruth_masks: (optional) A np.uint8 numpy array of shape + [M, height, width]. + + Returns: + result_scores: A list of float numpy arrays. Each numpy array is of + shape [K, 1], representing K scores detected with object class + label c + result_tp_fp_labels: A list of boolean numpy array. Each numpy + array is of shape [K, 1], representing K True/False positive + label of object instances detected with class label c + + Raises: + ValueError: If detected masks is not None but groundtruth masks are + None, or the other way around. 
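+
+        Example (illustrative sketch with hypothetical boxes, added for
+        clarity; not taken from the original source):
+            >>> pie = PerImageEvaluation(num_groundtruth_classes=2)
+            >>> scores, tp_fp = pie._compute_tp_fp(
+            ...     detected_boxes=np.array([[0., 0., 10., 10.]]),
+            ...     detected_scores=np.array([0.9]),
+            ...     detected_class_labels=np.array([0]),
+            ...     groundtruth_boxes=np.array([[0., 0., 10., 10.]]),
+            ...     groundtruth_class_labels=np.array([0]),
+            ...     groundtruth_is_difficult_list=np.array([False]),
+            ...     groundtruth_is_group_of_list=np.array([False]))
+            >>> len(scores), len(tp_fp)  # one entry per class
+            (2, 2)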
+ """ + if detected_masks is not None and groundtruth_masks is None: + raise ValueError( + 'Detected masks is available but groundtruth masks is not.') + if detected_masks is None and groundtruth_masks is not None: + raise ValueError( + 'Groundtruth masks is available but detected masks is not.') + + result_scores = [] + result_tp_fp_labels = [] + for i in range(self.num_groundtruth_classes): + groundtruth_is_difficult_list_at_ith_class = ( + groundtruth_is_difficult_list[groundtruth_class_labels == i]) + groundtruth_is_group_of_list_at_ith_class = ( + groundtruth_is_group_of_list[groundtruth_class_labels == i]) + ( + gt_boxes_at_ith_class, + gt_masks_at_ith_class, + detected_boxes_at_ith_class, + detected_scores_at_ith_class, + detected_masks_at_ith_class, + ) = self._get_ith_class_arrays(detected_boxes, detected_scores, + detected_masks, + detected_class_labels, + groundtruth_boxes, + groundtruth_masks, + groundtruth_class_labels, i) + scores, tp_fp_labels = self._compute_tp_fp_for_single_class( + detected_boxes=detected_boxes_at_ith_class, + detected_scores=detected_scores_at_ith_class, + groundtruth_boxes=gt_boxes_at_ith_class, + groundtruth_is_difficult_list=( + groundtruth_is_difficult_list_at_ith_class), + groundtruth_is_group_of_list=( + groundtruth_is_group_of_list_at_ith_class), + detected_masks=detected_masks_at_ith_class, + groundtruth_masks=gt_masks_at_ith_class, + ) + result_scores.append(scores) + result_tp_fp_labels.append(tp_fp_labels) + return result_scores, result_tp_fp_labels + + def _get_overlaps_and_scores_box_mode( + self, + detected_boxes, + detected_scores, + groundtruth_boxes, + groundtruth_is_group_of_list, + ): + """Computes overlaps and scores between detected and groudntruth boxes. + + Args: + detected_boxes: A numpy array of shape [N, 4] representing detected + box coordinates + detected_scores: A 1-d numpy array of length N representing + classification score + groundtruth_boxes: A numpy array of shape [M, 4] representing + ground truth box coordinates + groundtruth_is_group_of_list: A boolean numpy array of length M + denoting whether a ground truth box has group-of tag. If a + groundtruth box is group-of box, every detection matching this + box is ignored. + + Returns: + iou: A float numpy array of size [num_detected_boxes, + num_gt_boxes]. If gt_non_group_of_boxlist.num_boxes() == 0 it + will be None. + ioa: A float numpy array of size [num_detected_boxes, + num_gt_boxes]. If gt_group_of_boxlist.num_boxes() == 0 it will + be None. + scores: The score of the detected boxlist. + num_boxes: Number of non-maximum suppressed detected boxes. + """ + detected_boxlist = np_box_list.BoxList(detected_boxes) + detected_boxlist.add_field('scores', detected_scores) + gt_non_group_of_boxlist = np_box_list.BoxList( + groundtruth_boxes[~groundtruth_is_group_of_list]) + + iou = np_box_ops.iou(detected_boxlist.get(), + gt_non_group_of_boxlist.get()) + scores = detected_boxlist.get_field('scores') + num_boxes = detected_boxlist.num_boxes() + return iou, None, scores, num_boxes + + def _compute_tp_fp_for_single_class( + self, + detected_boxes, + detected_scores, + groundtruth_boxes, + groundtruth_is_difficult_list, + groundtruth_is_group_of_list, + detected_masks=None, + groundtruth_masks=None, + ): + """Labels boxes detected with the same class from the same image as + tp/fp. 
+ + Args: + detected_boxes: A numpy array of shape [N, 4] representing detected + box coordinates + detected_scores: A 1-d numpy array of length N representing + classification score + groundtruth_boxes: A numpy array of shape [M, 4] representing + groundtruth box coordinates + groundtruth_is_difficult_list: A boolean numpy array of length M + denoting whether a ground truth box is a difficult instance or + not. If a groundtruth box is difficult, every detection + matching this box is ignored. + groundtruth_is_group_of_list: A boolean numpy array of length M + denoting whether a ground truth box has group-of tag. If a + groundtruth box is group-of box, every detection matching this + box is ignored. + detected_masks: (optional) A uint8 numpy array of shape + [N, height, width]. If not None, the scores will be computed + based on masks. + groundtruth_masks: (optional) A uint8 numpy array of shape + [M, height, width]. + + Returns: + Two arrays of the same size, containing all boxes that were + evaluated as being true positives or false positives; if a box + matched to a difficult box or to a group-of box, it is ignored. + + scores: A numpy array representing the detection scores. + tp_fp_labels: a boolean numpy array indicating whether a detection + is a true positive. + """ + if detected_boxes.size == 0: + return np.array([], dtype=float), np.array([], dtype=bool) + + ( + iou, + _, + scores, + num_detected_boxes, + ) = self._get_overlaps_and_scores_box_mode( + detected_boxes=detected_boxes, + detected_scores=detected_scores, + groundtruth_boxes=groundtruth_boxes, + groundtruth_is_group_of_list=groundtruth_is_group_of_list, + ) + + if groundtruth_boxes.size == 0: + return scores, np.zeros(num_detected_boxes, dtype=bool) + + tp_fp_labels = np.zeros(num_detected_boxes, dtype=bool) + is_matched_to_difficult_box = np.zeros(num_detected_boxes, dtype=bool) + is_matched_to_group_of_box = np.zeros(num_detected_boxes, dtype=bool) + + # The evaluation is done in two stages: + # 1. All detections are matched to non group-of boxes; true positives + # are determined and detections matched to difficult boxes are + # ignored. + # 2. Detections that are determined as false positives are matched + # against group-of boxes and ignored if matched. + + # Tp-fp evaluation for non-group of boxes (if any). + if iou.shape[1] > 0: + groundtruth_nongroup_of_is_difficult_list = ( + groundtruth_is_difficult_list[~groundtruth_is_group_of_list]) + max_overlap_gt_ids = np.argmax(iou, axis=1) + is_gt_box_detected = np.zeros(iou.shape[1], dtype=bool) + for i in range(num_detected_boxes): + gt_id = max_overlap_gt_ids[i] + if iou[i, gt_id] >= self.matching_iou_threshold: + if not groundtruth_nongroup_of_is_difficult_list[gt_id]: + if not is_gt_box_detected[gt_id]: + tp_fp_labels[i] = True + is_gt_box_detected[gt_id] = True + else: + is_matched_to_difficult_box[i] = True + + return ( + scores[~is_matched_to_difficult_box & ~is_matched_to_group_of_box], + tp_fp_labels[~is_matched_to_difficult_box + & ~is_matched_to_group_of_box], + ) + + def _get_ith_class_arrays( + self, + detected_boxes, + detected_scores, + detected_masks, + detected_class_labels, + groundtruth_boxes, + groundtruth_masks, + groundtruth_class_labels, + class_index, + ): + """Returns numpy arrays belonging to class with index `class_index`. + + Args: + detected_boxes: A numpy array containing detected boxes. + detected_scores: A numpy array containing detected scores. + detected_masks: A numpy array containing detected masks. 
+ detected_class_labels: A numpy array containing detected class + labels. + groundtruth_boxes: A numpy array containing groundtruth boxes. + groundtruth_masks: A numpy array containing groundtruth masks. + groundtruth_class_labels: A numpy array containing groundtruth + class labels. + class_index: An integer index. + + Returns: + gt_boxes_at_ith_class: A numpy array containing groundtruth boxes + labeled as ith class. + gt_masks_at_ith_class: A numpy array containing groundtruth masks + labeled as ith class. + detected_boxes_at_ith_class: A numpy array containing detected + boxes corresponding to the ith class. + detected_scores_at_ith_class: A numpy array containing detected + scores corresponding to the ith class. + detected_masks_at_ith_class: A numpy array containing detected + masks corresponding to the ith class. + """ + selected_groundtruth = groundtruth_class_labels == class_index + gt_boxes_at_ith_class = groundtruth_boxes[selected_groundtruth] + if groundtruth_masks is not None: + gt_masks_at_ith_class = groundtruth_masks[selected_groundtruth] + else: + gt_masks_at_ith_class = None + selected_detections = detected_class_labels == class_index + detected_boxes_at_ith_class = detected_boxes[selected_detections] + detected_scores_at_ith_class = detected_scores[selected_detections] + if detected_masks is not None: + detected_masks_at_ith_class = detected_masks[selected_detections] + else: + detected_masks_at_ith_class = None + return ( + gt_boxes_at_ith_class, + gt_masks_at_ith_class, + detected_boxes_at_ith_class, + detected_scores_at_ith_class, + detected_masks_at_ith_class, + ) + + def _remove_invalid_boxes( + self, + detected_boxes, + detected_scores, + detected_class_labels, + detected_masks=None, + ): + """Removes entries with invalid boxes. + + A box is invalid if either its xmax is smaller than its xmin, or its + ymax is smaller than its ymin. + + Args: + detected_boxes: A float numpy array of size [num_boxes, 4] + containing box coordinates in [ymin, xmin, ymax, xmax] format. + detected_scores: A float numpy array of size [num_boxes]. + detected_class_labels: A int32 numpy array of size [num_boxes]. + detected_masks: A uint8 numpy array of size + [num_boxes, height, width]. + + Returns: + valid_detected_boxes: A float numpy array of size + [num_valid_boxes, 4] containing box coordinates in + [ymin, xmin, ymax, xmax] format. + valid_detected_scores: A float numpy array of size + [num_valid_boxes]. + valid_detected_class_labels: A int32 numpy array of size + [num_valid_boxes]. + valid_detected_masks: A uint8 numpy array of size + [num_valid_boxes, height, width]. + """ + valid_indices = np.logical_and( + detected_boxes[:, 0] < detected_boxes[:, 2], + detected_boxes[:, 1] < detected_boxes[:, 3], + ) + detected_boxes = detected_boxes[valid_indices] + detected_scores = detected_scores[valid_indices] + detected_class_labels = detected_class_labels[valid_indices] + if detected_masks is not None: + detected_masks = detected_masks[valid_indices] + return [ + detected_boxes, + detected_scores, + detected_class_labels, + detected_masks, + ] diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/standard_fields.py b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/standard_fields.py new file mode 100644 index 0000000..8edf46d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/standard_fields.py @@ -0,0 +1,115 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""Contains classes specifying naming conventions used for object detection. + +Specifies: + InputDataFields: standard fields used by reader/preprocessor/batcher. + DetectionResultFields: standard fields returned by object detector. +""" + + +class InputDataFields: + """Names for the input tensors. + + Holds the standard data field names to use for identifying input tensors. + This should be used by the decoder to identify keys for the returned + tensor_dict containing input tensors. And it should be used by the model to + identify the tensors it needs. + + Attributes: + image: image. + original_image: image in the original input size. + key: unique key corresponding to image. + source_id: source of the original image. + filename: original filename of the dataset (without common path). + groundtruth_image_classes: image-level class labels. + groundtruth_boxes: coordinates of the ground truth boxes in the image. + groundtruth_classes: box-level class labels. + groundtruth_label_types: box-level label types (e.g. explicit + negative). + groundtruth_is_crowd: [DEPRECATED, use groundtruth_group_of instead] + is the groundtruth a single object or a crowd. + groundtruth_area: area of a groundtruth segment. + groundtruth_difficult: is a `difficult` object + groundtruth_group_of: is a `group_of` objects, e.g. multiple objects of + the same class, forming a connected group, where instances are + heavily occluding each other. + proposal_boxes: coordinates of object proposal boxes. + proposal_objectness: objectness score of each proposal. + groundtruth_instance_masks: ground truth instance masks. + groundtruth_instance_boundaries: ground truth instance boundaries. + groundtruth_instance_classes: instance mask-level class labels. + groundtruth_keypoints: ground truth keypoints. + groundtruth_keypoint_visibilities: ground truth keypoint visibilities. + groundtruth_label_scores: groundtruth label scores. + groundtruth_weights: groundtruth weight factor for bounding boxes. + num_groundtruth_boxes: number of groundtruth boxes. + true_image_shapes: true shapes of images in the resized images, as + resized images can be padded with zeros. 
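+
+    Example (added for illustration; the attributes are defined just below
+    and are plain string constants used as dictionary keys):
+        >>> InputDataFields.groundtruth_boxes
+        'groundtruth_boxes'
+        >>> InputDataFields.groundtruth_classes
+        'groundtruth_classes'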
+ """ + + image = 'image' + original_image = 'original_image' + key = 'key' + source_id = 'source_id' + filename = 'filename' + groundtruth_image_classes = 'groundtruth_image_classes' + groundtruth_boxes = 'groundtruth_boxes' + groundtruth_classes = 'groundtruth_classes' + groundtruth_label_types = 'groundtruth_label_types' + groundtruth_is_crowd = 'groundtruth_is_crowd' + groundtruth_area = 'groundtruth_area' + groundtruth_difficult = 'groundtruth_difficult' + groundtruth_group_of = 'groundtruth_group_of' + proposal_boxes = 'proposal_boxes' + proposal_objectness = 'proposal_objectness' + groundtruth_instance_masks = 'groundtruth_instance_masks' + groundtruth_instance_boundaries = 'groundtruth_instance_boundaries' + groundtruth_instance_classes = 'groundtruth_instance_classes' + groundtruth_keypoints = 'groundtruth_keypoints' + groundtruth_keypoint_visibilities = 'groundtruth_keypoint_visibilities' + groundtruth_label_scores = 'groundtruth_label_scores' + groundtruth_weights = 'groundtruth_weights' + num_groundtruth_boxes = 'num_groundtruth_boxes' + true_image_shape = 'true_image_shape' + + +class DetectionResultFields: + """Naming conventions for storing the output of the detector. + + Attributes: + source_id: source of the original image. + key: unique key corresponding to image. + detection_boxes: coordinates of the detection boxes in the image. + detection_scores: detection scores for the detection boxes in the + image. + detection_classes: detection-level class labels. + detection_masks: contains a segmentation mask for each detection box. + detection_boundaries: contains an object boundary for each detection + box. + detection_keypoints: contains detection keypoints for each detection + box. + num_detections: number of detections in the batch. + """ + + source_id = 'source_id' + key = 'key' + detection_boxes = 'detection_boxes' + detection_scores = 'detection_scores' + detection_classes = 'detection_classes' + detection_masks = 'detection_masks' + detection_boundaries = 'detection_boundaries' + detection_keypoints = 'detection_keypoints' + num_detections = 'num_detections' diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_metric.py b/Bank_second_part/detect_process/paddlevideo/metrics/ava_metric.py new file mode 100644 index 0000000..b17c8c8 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ava_metric.py @@ -0,0 +1,92 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import numpy as np +import paddle +from collections import OrderedDict +from paddlevideo.utils import get_logger, load, log_batch, AverageMeter +from .registry import METRIC +from .base import BaseMetric +import time +from datetime import datetime +from .ava_utils import ava_evaluate_results + +logger = get_logger("paddlevideo") +""" An example for metrics class. + MultiCropMetric for slowfast. 
+""" + + +@METRIC.register +class AVAMetric(BaseMetric): + def __init__(self, + data_size, + batch_size, + file_path, + exclude_file, + label_file, + custom_classes, + log_interval=1): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + + self.file_path = file_path + self.exclude_file = exclude_file + self.label_file = label_file + self.custom_classes = custom_classes + + self.results = [] + + record_list = [ + ("loss", AverageMeter('loss', '7.5f')), + ("recall@thr=0.5", AverageMeter("recall@thr=0.5", '.5f')), + ("prec@thr=0.5", AverageMeter("prec@thr=0.5", '.5f')), + ("recall@top3", AverageMeter("recall@top3", '.5f')), + ("prec@top3", AverageMeter("prec@top3", '.5f')), + ("recall@top5", AverageMeter("recall@top5", '.5f')), + ("prec@top5", AverageMeter("prec@top5", '.5f')), + ("mAP@0.5IOU", AverageMeter("mAP@0.5IOU", '.5f')), + ("batch_time", AverageMeter('batch_cost', '.5f')), + ("reader_time", AverageMeter('reader_cost', '.5f')), + ] + + self.record_list = OrderedDict(record_list) + + self.tic = time.time() + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + + self.results.extend(outputs) + self.record_list['batch_time'].update(time.time() - self.tic) + tic = time.time() + ips = "ips: {:.5f} instance/sec.".format( + self.batch_size / self.record_list["batch_time"].val) + log_batch(self.record_list, batch_id, 0, 0, "test", ips) + + def set_dataset_info(self, info, dataset_len): + self.info = info + self.dataset_len = dataset_len + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + test_res = ava_evaluate_results(self.info, self.dataset_len, + self.results, None, self.label_file, + self.file_path, self.exclude_file) + + for name, value in test_res.items(): + self.record_list[name].update(value, self.batch_size) + + return self.record_list diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_utils.py b/Bank_second_part/detect_process/paddlevideo/metrics/ava_utils.py new file mode 100644 index 0000000..b127267 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ava_utils.py @@ -0,0 +1,394 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import csv +import heapq +import logging +import time +from collections import defaultdict +from .ava_evaluation import object_detection_evaluation as det_eval +from .ava_evaluation import standard_fields +from .recall import eval_recalls +import shutil +import pickle +import time +import os +import os.path as osp +from paddlevideo.utils import get_logger, get_dist_info +import paddle.distributed as dist +import sys +import numpy as np +from pathlib import Path +from datetime import datetime +import paddle + + +def det2csv(info, dataset_len, results, custom_classes): + csv_results = [] + for idx in range(dataset_len): + video_id = info[idx]['video_id'] + timestamp = info[idx]['timestamp'] + + result = results[idx] + for label, _ in enumerate(result): + for bbox in result[label]: + if type(bbox) == paddle.Tensor: + bbox = bbox.numpy() + + bbox_ = tuple(bbox.tolist()) + if custom_classes is not None: + actual_label = custom_classes[label + 1] + else: + actual_label = label + 1 + csv_results.append(( + video_id, + timestamp, + ) + bbox_[:4] + (actual_label, ) + bbox_[4:]) + return csv_results + + +# results is organized by class +def results2csv(info, dataset_len, results, out_file, custom_classes=None): + if isinstance(results[0], list): + csv_results = det2csv(info, dataset_len, results, custom_classes) + + # save space for float + def tostr(item): + if isinstance(item, float): + return f'{item:.3f}' + return str(item) + + with open(out_file, 'w') as f: + for csv_result in csv_results: + f.write(','.join(map(lambda x: tostr(x), csv_result))) + f.write('\n') + + +def print_time(message, start): + print('==> %g seconds to %s' % (time.time() - start, message)) + + +def make_image_key(video_id, timestamp): + """Returns a unique identifier for a video id & timestamp.""" + return f'{video_id},{int(timestamp):04d}' + + +def read_csv(csv_file, class_whitelist=None, capacity=0): + """Loads boxes and class labels from a CSV file in the AVA format. + + CSV file format described at https://research.google.com/ava/download.html. + + Args: + csv_file: A file object. + class_whitelist: If provided, boxes corresponding to (integer) class + labels not in this set are skipped. + capacity: Maximum number of labeled boxes allowed for each example. + Default is 0 where there is no limit. + + Returns: + boxes: A dictionary mapping each unique image key (string) to a list of + boxes, given as coordinates [y1, x1, y2, x2]. + labels: A dictionary mapping each unique image key (string) to a list + of integer class lables, matching the corresponding box in `boxes`. + scores: A dictionary mapping each unique image key (string) to a list + of score values lables, matching the corresponding label in `labels`. + If scores are not provided in the csv, then they will default to 1.0. 
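+
+    Example (hypothetical row, included only to illustrate the column layout
+    this parser expects):
+        the CSV line
+            'vid_001,0902,0.07,0.14,0.52,0.98,12,0.95'
+        is keyed as 'vid_001,0902' and parsed into the box
+        [0.14, 0.07, 0.98, 0.52] (stored as [y1, x1, y2, x2]), the integer
+        label 12 and the score 0.95.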
+ """ + start = time.time() + entries = defaultdict(list) + boxes = defaultdict(list) + labels = defaultdict(list) + scores = defaultdict(list) + reader = csv.reader(csv_file) + for row in reader: + assert len(row) in [7, 8], 'Wrong number of columns: ' + row + image_key = make_image_key(row[0], row[1]) + x1, y1, x2, y2 = [float(n) for n in row[2:6]] + action_id = int(row[6]) + if class_whitelist and action_id not in class_whitelist: + continue + + score = 1.0 + if len(row) == 8: + score = float(row[7]) + if capacity < 1 or len(entries[image_key]) < capacity: + heapq.heappush(entries[image_key], + (score, action_id, y1, x1, y2, x2)) + elif score > entries[image_key][0][0]: + heapq.heapreplace(entries[image_key], + (score, action_id, y1, x1, y2, x2)) + for image_key in entries: + # Evaluation API assumes boxes with descending scores + entry = sorted(entries[image_key], key=lambda tup: -tup[0]) + for item in entry: + score, action_id, y1, x1, y2, x2 = item + boxes[image_key].append([y1, x1, y2, x2]) + labels[image_key].append(action_id) + scores[image_key].append(score) + print_time('read file ' + csv_file.name, start) + return boxes, labels, scores + + +def read_exclusions(exclusions_file): + """Reads a CSV file of excluded timestamps. + + Args: + exclusions_file: A file object containing a csv of video-id,timestamp. + + Returns: + A set of strings containing excluded image keys, e.g. + "aaaaaaaaaaa,0904", + or an empty set if exclusions file is None. + """ + excluded = set() + if exclusions_file: + reader = csv.reader(exclusions_file) + for row in reader: + assert len(row) == 2, 'Expected only 2 columns, got: ' + row + excluded.add(make_image_key(row[0], row[1])) + return excluded + + +def read_labelmap(labelmap_file): + """Reads a labelmap without the dependency on protocol buffers. + + Args: + labelmap_file: A file object containing a label map protocol buffer. + + Returns: + labelmap: The label map in the form used by the + object_detection_evaluation + module - a list of {"id": integer, "name": classname } dicts. + class_ids: A set containing all of the valid class id integers. 
+ """ + labelmap = [] + class_ids = set() + name = '' + class_id = '' + for line in labelmap_file: + if line.startswith(' name:'): + name = line.split('"')[1] + elif line.startswith(' id:') or line.startswith(' label_id:'): + class_id = int(line.strip().split(' ')[-1]) + labelmap.append({'id': class_id, 'name': name}) + class_ids.add(class_id) + return labelmap, class_ids + + +# Seems there is at most 100 detections for each image +def ava_eval(result_file, + result_type, + label_file, + ann_file, + exclude_file, + max_dets=(100, ), + verbose=True, + custom_classes=None): + + assert result_type in ['mAP'] + start = time.time() + categories, class_whitelist = read_labelmap(open(label_file)) + + if custom_classes is not None: + custom_classes = custom_classes[1:] + assert set(custom_classes).issubset(set(class_whitelist)) + class_whitelist = custom_classes + categories = [cat for cat in categories if cat['id'] in custom_classes] + + # loading gt, do not need gt score + gt_boxes, gt_labels, _ = read_csv(open(ann_file), class_whitelist, 0) + if verbose: + print_time('Reading detection results', start) + + if exclude_file is not None: + excluded_keys = read_exclusions(open(exclude_file)) + else: + excluded_keys = list() + + start = time.time() + boxes, labels, scores = read_csv(open(result_file), class_whitelist, 0) + if verbose: + print_time('Reading detection results', start) + + if result_type == 'proposal': + gts = [ + np.array(gt_boxes[image_key], dtype=float) for image_key in gt_boxes + ] + proposals = [] + for image_key in gt_boxes: + if image_key in boxes: + proposals.append( + np.concatenate( + (np.array(boxes[image_key], dtype=float), + np.array(scores[image_key], dtype=float)[:, None]), + axis=1)) + else: + # if no corresponding proposal, add a fake one + proposals.append(np.array([0, 0, 1, 1, 1])) + + # Proposals used here are with scores + recalls = eval_recalls(gts, proposals, np.array(max_dets), + np.arange(0.5, 0.96, 0.05)) + ar = recalls.mean(axis=1) + ret = {} + for i, num in enumerate(max_dets): + print(f'Recall@0.5@{num}\t={recalls[i, 0]:.4f}') + print(f'AR@{num}\t={ar[i]:.4f}') + ret[f'Recall@0.5@{num}'] = recalls[i, 0] + ret[f'AR@{num}'] = ar[i] + return ret + + if result_type == 'mAP': + pascal_evaluator = det_eval.PascalDetectionEvaluator(categories) + + start = time.time() + for image_key in gt_boxes: + if verbose and image_key in excluded_keys: + logging.info( + 'Found excluded timestamp in detections: %s.' + 'It will be ignored.', image_key) + continue + pascal_evaluator.add_single_ground_truth_image_info( + image_key, { + standard_fields.InputDataFields.groundtruth_boxes: + np.array(gt_boxes[image_key], dtype=float), + standard_fields.InputDataFields.groundtruth_classes: + np.array(gt_labels[image_key], dtype=int), + standard_fields.InputDataFields.groundtruth_difficult: + np.zeros(len(gt_boxes[image_key]), dtype=bool) + }) + if verbose: + print_time('Convert groundtruth', start) + + start = time.time() + for image_key in boxes: + if verbose and image_key in excluded_keys: + logging.info( + 'Found excluded timestamp in detections: %s.' 
+ 'It will be ignored.', image_key) + continue + pascal_evaluator.add_single_detected_image_info( + image_key, { + standard_fields.DetectionResultFields.detection_boxes: + np.array(boxes[image_key], dtype=float), + standard_fields.DetectionResultFields.detection_classes: + np.array(labels[image_key], dtype=int), + standard_fields.DetectionResultFields.detection_scores: + np.array(scores[image_key], dtype=float) + }) + if verbose: + print_time('convert detections', start) + + start = time.time() + metrics = pascal_evaluator.evaluate() + if verbose: + print_time('run_evaluator', start) + for display_name in metrics: + print(f'{display_name}=\t{metrics[display_name]}') + ret = { + display_name: metrics[display_name] + for display_name in metrics if 'ByCategory' not in display_name + } + return ret + + +def mkdir_or_exist(dir_name, mode=0o777): + if dir_name == '': + return + dir_name = osp.expanduser(dir_name) + os.makedirs(dir_name, mode=mode, exist_ok=True) + + +def dump_to_fileobj(obj, file, **kwargs): + kwargs.setdefault('protocol', 2) + pickle.dump(obj, file, **kwargs) + + +def dump_to_path(obj, filepath, mode='wb'): + with open(filepath, mode) as f: + dump_to_fileobj(obj, f) + + +def load_from_fileobj(file, **kwargs): + return pickle.load(file, **kwargs) + + +def load_from_path(filepath, mode='rb'): + with open(filepath, mode) as f: + return load_from_fileobj(f) + + +def collect_results_cpu(result_part, size): + """Collect results in cpu mode. + It saves the results on different gpus to 'tmpdir' and collects + them by the rank 0 worker. + """ + tmpdir = osp.join('./', 'collect_results_cpu') + #1. load results of all parts from tmp dir + mkdir_or_exist(tmpdir) + rank, world_size = get_dist_info() + dump_to_path(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + dist.barrier() + if rank != 0: + return None + #2. collect all parts + while 1: + all_exist = True + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + if not Path(part_file).exists(): + all_exist = False + if all_exist: + break + else: + time.sleep(60) + time.sleep(120) + #3. load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_list.append(load_from_path(part_file)) + #4. sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + ordered_results = ordered_results[: + size] #the dataloader may pad some samples + #5. remove results of all parts from tmp dir, avoid dump_file fail to tmp dir when dir not exists. + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + os.remove(part_file) + + return ordered_results + + +def ava_evaluate_results(info, dataset_len, results, custom_classes, label_file, + file_path, exclude_file): + # need to create a temp result file + time_now = datetime.now().strftime('%Y%m%d_%H%M%S') + temp_file = f'AVA_{time_now}_result.csv' + results2csv(info, dataset_len, results, temp_file) + ret = {} + eval_result = ava_eval( + temp_file, + 'mAP', + label_file, + file_path, #ann_file, + exclude_file, + custom_classes=custom_classes) + ret.update(eval_result) + + os.remove(temp_file) + + return ret diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/base.py b/Bank_second_part/detect_process/paddlevideo/metrics/base.py new file mode 100644 index 0000000..9842232 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/base.py @@ -0,0 +1,52 @@ +# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from abc import abstractmethod + +import paddle +from paddlevideo.utils import get_dist_info + +from .registry import METRIC + + +class BaseMetric(object): + def __init__(self, data_size, batch_size, log_interval=1, **kwargs): + self.data_size = data_size + self.batch_size = batch_size + _, self.world_size = get_dist_info() + self.log_interval = log_interval + + def gather_from_gpu(self, + gather_object: paddle.Tensor, + concat_axis=0) -> paddle.Tensor: + """gather Tensor from all gpus into a list and concatenate them on `concat_axis`. + + Args: + gather_object (paddle.Tensor): gather object Tensor + concat_axis (int, optional): axis for concatenation. Defaults to 0. + + Returns: + paddle.Tensor: gatherd & concatenated Tensor + """ + gather_object_list = [] + paddle.distributed.all_gather(gather_object_list, gather_object.cuda()) + return paddle.concat(gather_object_list, axis=concat_axis) + + @abstractmethod + def update(self): + raise NotImplementedError( + "'update' method must be implemented in subclass") + + @abstractmethod + def accumulate(self): + raise NotImplementedError( + "'accumulate' method must be implemented in subclass") diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/bmn_metric.py b/Bank_second_part/detect_process/paddlevideo/metrics/bmn_metric.py new file mode 100644 index 0000000..cc36283 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/bmn_metric.py @@ -0,0 +1,304 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import os +import json +import numpy as np +import pandas as pd +import multiprocessing as mp + +from .registry import METRIC +from .base import BaseMetric +from .ActivityNet import ANETproposal +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +def iou_with_anchors(anchors_min, anchors_max, box_min, box_max): + """Compute jaccard score between a box and the anchors. + """ + len_anchors = anchors_max - anchors_min + int_xmin = np.maximum(anchors_min, box_min) + int_xmax = np.minimum(anchors_max, box_max) + inter_len = np.maximum(int_xmax - int_xmin, 0.) + union_len = len_anchors - inter_len + box_max - box_min + jaccard = np.divide(inter_len, union_len) + return jaccard + + +def boundary_choose(score_list): + """Choose start and end boundary from score. 
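+
+    A position is kept if its score is above 0.5 * max(score_list) or if it
+    is a local peak (strictly greater than both neighbours); the returned
+    mask is the union of these two conditions.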
+ """ + max_score = max(score_list) + mask_high = (score_list > max_score * 0.5) + score_list = list(score_list) + score_middle = np.array([0.0] + score_list + [0.0]) + score_front = np.array([0.0, 0.0] + score_list) + score_back = np.array(score_list + [0.0, 0.0]) + mask_peak = ((score_middle > score_front) & (score_middle > score_back)) + mask_peak = mask_peak[1:-1] + mask = (mask_high | mask_peak).astype('float32') + return mask + + +def soft_nms(df, alpha, t1, t2): + ''' + df: proposals generated by network; + alpha: alpha value of Gaussian decaying function; + t1, t2: threshold for soft nms. + ''' + df = df.sort_values(by="score", ascending=False) + tstart = list(df.xmin.values[:]) + tend = list(df.xmax.values[:]) + tscore = list(df.score.values[:]) + + rstart = [] + rend = [] + rscore = [] + + while len(tscore) > 1 and len(rscore) < 101: + max_index = tscore.index(max(tscore)) + tmp_iou_list = iou_with_anchors(np.array(tstart), np.array(tend), + tstart[max_index], tend[max_index]) + for idx in range(0, len(tscore)): + if idx != max_index: + tmp_iou = tmp_iou_list[idx] + tmp_width = tend[max_index] - tstart[max_index] + if tmp_iou > t1 + (t2 - t1) * tmp_width: + tscore[idx] = tscore[idx] * np.exp( + -np.square(tmp_iou) / alpha) + + rstart.append(tstart[max_index]) + rend.append(tend[max_index]) + rscore.append(tscore[max_index]) + tstart.pop(max_index) + tend.pop(max_index) + tscore.pop(max_index) + + newDf = pd.DataFrame() + newDf['score'] = rscore + newDf['xmin'] = rstart + newDf['xmax'] = rend + return newDf + + +@METRIC.register +class BMNMetric(BaseMetric): + """ + Metrics for BMN. Two Stages in this metric: + (1) Get test results using trained model, results will be saved in BMNMetric.result_path; + (2) Calculate metrics using results file from stage (1). + """ + + def __init__(self, + data_size, + batch_size, + tscale, + dscale, + file_path, + ground_truth_filename, + subset, + output_path, + result_path, + get_metrics=True, + log_interval=1): + """ + Init for BMN metrics. + Params: + get_metrics: whether to calculate AR@N and AUC metrics or not, default True. 
+ """ + super().__init__(data_size, batch_size, log_interval) + assert self.batch_size == 1, " Now we just support batch_size==1 test" + assert self.world_size == 1, " Now we just support single-card test" + + self.tscale = tscale + self.dscale = dscale + self.file_path = file_path + self.ground_truth_filename = ground_truth_filename + self.subset = subset + self.output_path = output_path + self.result_path = result_path + self.get_metrics = get_metrics + + if not os.path.isdir(self.output_path): + os.makedirs(self.output_path) + if not os.path.isdir(self.result_path): + os.makedirs(self.result_path) + + self.video_dict, self.video_list = self.get_dataset_dict( + self.file_path, self.subset) + + def get_dataset_dict(self, file_path, subset): + annos = json.load(open(file_path)) + video_dict = {} + for video_name in annos.keys(): + video_subset = annos[video_name]["subset"] + if subset in video_subset: + video_dict[video_name] = annos[video_name] + video_list = list(video_dict.keys()) + video_list.sort() + return video_dict, video_list + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + fid = data[4].numpy() + pred_bm, pred_start, pred_end = outputs + pred_bm = pred_bm.numpy() + pred_start = pred_start[0].numpy() + pred_end = pred_end[0].numpy() + + snippet_xmins = [1.0 / self.tscale * i for i in range(self.tscale)] + snippet_xmaxs = [ + 1.0 / self.tscale * i for i in range(1, self.tscale + 1) + ] + cols = ["xmin", "xmax", "score"] + + video_name = self.video_list[fid[0]] + pred_bm = pred_bm[0, 0, :, :] * pred_bm[0, 1, :, :] + start_mask = boundary_choose(pred_start) + start_mask[0] = 1. + end_mask = boundary_choose(pred_end) + end_mask[-1] = 1. + score_vector_list = [] + for idx in range(self.dscale): + for jdx in range(self.tscale): + start_index = jdx + end_index = start_index + idx + if end_index < self.tscale and start_mask[ + start_index] == 1 and end_mask[end_index] == 1: + xmin = snippet_xmins[start_index] + xmax = snippet_xmaxs[end_index] + xmin_score = pred_start[start_index] + xmax_score = pred_end[end_index] + bm_score = pred_bm[idx, jdx] + conf_score = xmin_score * xmax_score * bm_score + score_vector_list.append([xmin, xmax, conf_score]) + + score_vector_list = np.stack(score_vector_list) + video_df = pd.DataFrame(score_vector_list, columns=cols) + video_df.to_csv(os.path.join(self.output_path, "%s.csv" % video_name), + index=False) + + if batch_id % self.log_interval == 0: + logger.info("Processing................ batch {}".format(batch_id)) + + def accumulate(self): + """accumulate metrics when finished all iters. 
+ """ + # check clip index of each video + #Stage1 + self.bmn_post_processing(self.video_dict, self.subset, self.output_path, + self.result_path) + if self.get_metrics: + logger.info("[TEST] calculate metrics...") + #Stage2 + uniform_average_nr_proposals_valid, uniform_average_recall_valid, uniform_recall_valid = self.cal_metrics( + self.ground_truth_filename, + os.path.join(self.result_path, "bmn_results_validation.json"), + max_avg_nr_proposals=100, + tiou_thresholds=np.linspace(0.5, 0.95, 10), + subset='validation') + logger.info("AR@1; AR@5; AR@10; AR@100") + logger.info("%.02f %.02f %.02f %.02f" % + (100 * np.mean(uniform_recall_valid[:, 0]), + 100 * np.mean(uniform_recall_valid[:, 4]), + 100 * np.mean(uniform_recall_valid[:, 9]), + 100 * np.mean(uniform_recall_valid[:, -1]))) + + def bmn_post_processing(self, video_dict, subset, output_path, result_path): + video_list = list(video_dict.keys()) + global result_dict + result_dict = mp.Manager().dict() + pp_num = 12 + + num_videos = len(video_list) + num_videos_per_thread = int(num_videos / pp_num) + processes = [] + for tid in range(pp_num - 1): + tmp_video_list = video_list[tid * num_videos_per_thread:(tid + 1) * + num_videos_per_thread] + p = mp.Process(target=self.video_process, + args=(tmp_video_list, video_dict, output_path, + result_dict)) + p.start() + processes.append(p) + tmp_video_list = video_list[(pp_num - 1) * num_videos_per_thread:] + p = mp.Process(target=self.video_process, + args=(tmp_video_list, video_dict, output_path, + result_dict)) + p.start() + processes.append(p) + for p in processes: + p.join() + + result_dict = dict(result_dict) + output_dict = { + "version": "VERSION 1.3", + "results": result_dict, + "external_data": {} + } + outfile = open( + os.path.join(result_path, "bmn_results_%s.json" % subset), "w") + + # json.dump(output_dict, outfile) + # in case of file name in chinese + json.dump(output_dict, outfile, ensure_ascii=False) + outfile.close() + + def video_process(self, + video_list, + video_dict, + output_path, + result_dict, + snms_alpha=0.4, + snms_t1=0.55, + snms_t2=0.9): + + for video_name in video_list: + logger.info("Processing video........" 
+ video_name) + df = pd.read_csv(os.path.join(output_path, video_name + ".csv")) + if len(df) > 1: + df = soft_nms(df, snms_alpha, snms_t1, snms_t2) + + video_duration = video_dict[video_name]["duration_second"] + proposal_list = [] + for idx in range(min(100, len(df))): + tmp_prop={"score":df.score.values[idx], \ + "segment":[max(0,df.xmin.values[idx])*video_duration, \ + min(1,df.xmax.values[idx])*video_duration]} + proposal_list.append(tmp_prop) + + video_name = video_name[2:] if video_name[:2] == 'v_' else video_name + result_dict[video_name] = proposal_list + + def cal_metrics(self, + ground_truth_filename, + proposal_filename, + max_avg_nr_proposals=100, + tiou_thresholds=np.linspace(0.5, 0.95, 10), + subset='validation'): + + anet_proposal = ANETproposal(ground_truth_filename, + proposal_filename, + tiou_thresholds=tiou_thresholds, + max_avg_nr_proposals=max_avg_nr_proposals, + subset=subset, + verbose=True, + check_status=False) + anet_proposal.evaluate() + recall = anet_proposal.recall + average_recall = anet_proposal.avg_recall + average_nr_proposals = anet_proposal.proposals_per_video + + return (average_nr_proposals, average_recall, recall) diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/build.py b/Bank_second_part/detect_process/paddlevideo/metrics/build.py new file mode 100644 index 0000000..82e4b50 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/build.py @@ -0,0 +1,20 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .registry import METRIC +from ..utils import build + + +def build_metric(cfg): + return build(cfg, METRIC) diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/center_crop_metric.py b/Bank_second_part/detect_process/paddlevideo/metrics/center_crop_metric.py new file mode 100644 index 0000000..0ca6112 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/center_crop_metric.py @@ -0,0 +1,79 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
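+
+# Rough usage sketch (illustrative only, based on the API defined below):
+#
+#   metric = CenterCropMetric(data_size=len(dataset), batch_size=batch_size)
+#   for batch_id, data in enumerate(test_loader):   # data = [inputs, labels, ...]
+#       outputs = model(data[0])
+#       metric.update(batch_id, data, outputs)
+#   metric.accumulate()   # logs avg_acc1 / avg_acc5 over the whole test set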
+ +from typing import List + +import paddle +from paddlevideo.utils import get_logger + +from .base import BaseMetric +from .registry import METRIC + +logger = get_logger("paddlevideo") + + +@METRIC.register +class CenterCropMetric(BaseMetric): + def __init__(self, data_size, batch_size, log_interval=1, **kwargs): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval, **kwargs) + self.rest_data_size = data_size # Number of samples remaining to be tested + self.all_outputs = [] + self.all_labels = [] + self.topk = kwargs.get("topk", [1, 5]) + + def update(self, batch_id: int, data: List, outputs: paddle.Tensor) -> None: + """update metrics during each iter + + Args: + batch_id (int): iter id of current batch. + data (List): list of batched data, such as [inputs, labels] + outputs (paddle.Tensor): batched outputs from model + """ + labels = data[1] + if self.world_size > 1: + labels_gathered = self.gather_from_gpu(labels, concat_axis=0) + outpus_gathered = self.gather_from_gpu(outputs, concat_axis=0) + else: + labels_gathered = labels + outpus_gathered = outputs + + # Avoid resampling effects when testing with multiple cards + labels_gathered = labels_gathered[0:min(len(labels_gathered), self. + rest_data_size)] + outpus_gathered = outpus_gathered[0:min(len(outpus_gathered), self. + rest_data_size)] + self.all_labels.append(labels_gathered) + self.all_outputs.append(outpus_gathered) + self.rest_data_size -= outpus_gathered.shape[0] + + # preds ensemble + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def accumulate(self): + """accumulate, compute, and show metrics when finished all iters. + """ + self.all_outputs = paddle.concat(self.all_outputs, axis=0) + self.all_labels = paddle.concat(self.all_labels, axis=0) + + result_str = [] + for _k in self.topk: + topk_val = paddle.metric.accuracy(input=self.all_outputs, + label=self.all_labels, + k=_k).item() + result_str.append(f"avg_acc{_k}={topk_val}") + result_str = ", ".join(result_str) + logger.info(f"[TEST] finished, {result_str}") diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/center_crop_metric_MRI.py b/Bank_second_part/detect_process/paddlevideo/metrics/center_crop_metric_MRI.py new file mode 100644 index 0000000..b6d231a --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/center_crop_metric_MRI.py @@ -0,0 +1,61 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ +import numpy as np +import paddle + +from .registry import METRIC +from .base import BaseMetric +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@METRIC.register +class CenterCropMetric_MRI(BaseMetric): + def __init__(self, data_size, batch_size, log_interval=1, if_slowfast=0): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.top1 = [] + self.if_slowfast = if_slowfast + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + labels = data[1] + + if self.if_slowfast: + labels = data[2] + + top1 = paddle.metric.accuracy(input=outputs, label=labels, k=1) + #top5 = paddle.metric.accuracy(input=outputs, label=labels, k=5) + #NOTE(shipping): deal with multi cards validate + if self.world_size > 1: + top1 = paddle.distributed.all_reduce( + top1, op=paddle.distributed.ReduceOp.SUM) / self.world_size + # top5 = paddle.distributed.all_reduce( + # top5, op=paddle.distributed.ReduceOp.SUM) / self.world_size + + self.top1.append(top1.numpy()) + #self.top5.append(top5.numpy()) + # preds ensemble + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + logger.info('[TEST] finished, avg_acc1= {}'.format( + np.mean(np.array(self.top1)))) diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/depth_metric.py b/Bank_second_part/detect_process/paddlevideo/metrics/depth_metric.py new file mode 100644 index 0000000..c160e16 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/depth_metric.py @@ -0,0 +1,77 @@ +import numpy as np +import paddle +from paddlevideo.utils import get_logger + +from .base import BaseMetric +from .registry import METRIC + +logger = get_logger("paddlevideo") + + +@METRIC.register +class DepthMetric(BaseMetric): + def __init__(self, data_size, batch_size, log_interval=1): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.abs_rel = [] + self.sq_rel = [] + self.rmse = [] + self.rmse_log = [] + self.a1 = [] + self.a2 = [] + self.a3 = [] + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = outputs['abs_rel'], outputs['sq_rel'], outputs['rmse'], \ + outputs['rmse_log'], outputs['a1'], outputs['a2'],outputs['a3'] + # preds ensemble + if self.world_size > 1: + abs_rel = paddle.distributed.all_reduce( + outputs['abs_rel'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + sq_rel = paddle.distributed.all_reduce( + outputs['sq_rel'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + rmse = paddle.distributed.all_reduce( + outputs['rmse'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + rmse_log = paddle.distributed.all_reduce( + outputs['rmse_log'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + a1 = paddle.distributed.all_reduce( + outputs['a1'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + a2 = paddle.distributed.all_reduce( + outputs['a2'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + a3 = paddle.distributed.all_reduce( + outputs['a3'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + + self.abs_rel.append(abs_rel) + self.sq_rel.append(sq_rel) + self.rmse.append(rmse) + self.rmse_log.append(rmse_log) + self.a1.append(a1) + self.a2.append(a2) + self.a3.append(a3) + 
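+        # the per-batch values buffered above are averaged over the whole
+        # test set in accumulate()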
if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + logger.info( + '[TEST] finished, abs_rel= {}, sq_rel= {} , rmse= {}, rmse_log= {},' + 'a1= {}, a2= {}, a3= {}'.format(np.mean(np.array(self.abs_rel)), + np.mean(np.array(self.sq_rel)), + np.mean(np.array(self.rmse)), + np.mean(np.array(self.rmse_log)), + np.mean(np.array(self.a1)), + np.mean(np.array(self.a2)), + np.mean(np.array(self.a3)))) diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/msrvtt_metric.py b/Bank_second_part/detect_process/paddlevideo/metrics/msrvtt_metric.py new file mode 100644 index 0000000..99e7334 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/msrvtt_metric.py @@ -0,0 +1,62 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import numpy as np +import paddle +import paddle.nn.functional as F + +from .registry import METRIC +from .base import BaseMetric +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@METRIC.register +class MSRVTTMetric(BaseMetric): + def __init__(self, data_size, batch_size, log_interval=1): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.score_matrix = np.zeros((data_size, data_size)) + self.target_matrix = np.zeros((data_size, data_size)) + self.rank_matrix = np.ones((data_size)) * data_size + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + target = data[-1] + cm_logit = outputs[-1] + + self.score_matrix[batch_id, :] = F.softmax( + cm_logit, axis=1)[:, 0].reshape([-1]).numpy() + self.target_matrix[batch_id, :] = target.reshape([-1]).numpy() + + rank = np.where((np.argsort(-self.score_matrix[batch_id]) == np.where( + self.target_matrix[batch_id] == 1)[0][0]) == 1)[0][0] + self.rank_matrix[batch_id] = rank + + rank_matrix_tmp = self.rank_matrix[:batch_id + 1] + r1 = 100.0 * np.sum(rank_matrix_tmp < 1) / len(rank_matrix_tmp) + r5 = 100.0 * np.sum(rank_matrix_tmp < 5) / len(rank_matrix_tmp) + r10 = 100.0 * np.sum(rank_matrix_tmp < 10) / len(rank_matrix_tmp) + + medr = np.floor(np.median(rank_matrix_tmp) + 1) + meanr = np.mean(rank_matrix_tmp) + 1 + logger.info( + "[{}] Final r1:{:.3f}, r5:{:.3f}, r10:{:.3f}, mder:{:.3f}, meanr:{:.3f}" + .format(batch_id, r1, r5, r10, medr, meanr)) + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + logger.info("Eval Finished!") diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/multi_crop_metric.py b/Bank_second_part/detect_process/paddlevideo/metrics/multi_crop_metric.py new file mode 100644 index 0000000..5f20ced --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/multi_crop_metric.py @@ -0,0 +1,108 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import numpy as np +import paddle +from paddle.hapi.model import _all_gather + +from paddlevideo.utils import get_logger +from .registry import METRIC +from .base import BaseMetric + +logger = get_logger("paddlevideo") +""" An example for metrics class. + MultiCropMetric for slowfast. +""" + + +@METRIC.register +class MultiCropMetric(BaseMetric): + def __init__(self, + data_size, + batch_size, + num_ensemble_views, + num_spatial_crops, + num_classes, + log_interval=1): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.num_ensemble_views = num_ensemble_views + self.num_spatial_crops = num_spatial_crops + self.num_classes = num_classes + + self.num_clips = self.num_ensemble_views * self.num_spatial_crops + num_videos = self.data_size // self.num_clips + self.video_preds = np.zeros((num_videos, self.num_classes)) + self.video_labels = np.zeros((num_videos, 1), dtype="int64") + self.clip_count = {} + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + labels = data[2] + clip_ids = data[3] + + # gather mulit card, results of following process in each card is the same. + if self.world_size > 1: + outputs = _all_gather(outputs, self.world_size) + labels = _all_gather(labels.cuda(), self.world_size) + clip_ids = _all_gather(clip_ids.cuda(), self.world_size) + + # to numpy + preds = outputs.numpy() + labels = labels.numpy().astype("int64") + clip_ids = clip_ids.numpy() + + # preds ensemble + for ind in range(preds.shape[0]): + vid_id = int(clip_ids[ind]) // self.num_clips + ts_idx = int(clip_ids[ind]) % self.num_clips + if vid_id not in self.clip_count: + self.clip_count[vid_id] = [] + if ts_idx in self.clip_count[vid_id]: + logger.info( + "[TEST] Passed!! read video {} clip index {} / {} repeatedly." + .format(vid_id, ts_idx, clip_ids[ind])) + else: + self.clip_count[vid_id].append(ts_idx) + self.video_preds[vid_id] += preds[ind] # ensemble method: sum + if self.video_labels[vid_id].sum() > 0: + assert self.video_labels[vid_id] == labels[ind] + self.video_labels[vid_id] = labels[ind] + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + # check clip index of each video + for key in self.clip_count.keys(): + if len(self.clip_count[key]) != self.num_clips or sum( + self.clip_count[key]) != self.num_clips * (self.num_clips - + 1) / 2: + logger.info( + "[TEST] Count Error!! 
video [{}] clip count [{}] not match number clips {}" + .format(key, self.clip_count[key], self.num_clips)) + + video_preds = paddle.to_tensor(self.video_preds) + video_labels = paddle.to_tensor(self.video_labels) + acc_top1 = paddle.metric.accuracy(input=video_preds, + label=video_labels, + k=1) + acc_top5 = paddle.metric.accuracy(input=video_preds, + label=video_labels, + k=5) + logger.info('[TEST] finished, avg_acc1= {}, avg_acc5= {} '.format( + acc_top1.numpy(), acc_top5.numpy())) diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/recall.py b/Bank_second_part/detect_process/paddlevideo/metrics/recall.py new file mode 100644 index 0000000..3612e22 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/recall.py @@ -0,0 +1,84 @@ +import numpy as np +import paddle + +def _recalls(all_ious, proposal_nums, thrs): + + img_num = all_ious.shape[0] + total_gt_num = sum([ious.shape[0] for ious in all_ious]) + + ious_ = np.zeros((proposal_nums.size, total_gt_num), dtype=np.float32) + for k, proposal_num in enumerate(proposal_nums): + tmp_ious = np.zeros(0) + for i in range(img_num): + ious = all_ious[i][:, :proposal_num].copy() + gt_ious = np.zeros(ious.shape[0]) + if ious.size == 0: + tmp_ious = np.hstack((tmp_ious, gt_ious)) + continue + for j in range(ious.shape[0]): + gt_max_overlaps = ious.argmax(axis=1) + max_ious = ious[np.arange(0, ious.shape[0]), gt_max_overlaps] + gt_idx = max_ious.argmax() + gt_ious[j] = max_ious[gt_idx] + box_idx = gt_max_overlaps[gt_idx] + ious[gt_idx, :] = -1 + ious[:, box_idx] = -1 + tmp_ious = np.hstack((tmp_ious, gt_ious)) + ious_[k, :] = tmp_ious + + ious_ = np.fliplr(np.sort(ious_, axis=1)) + recalls = np.zeros((proposal_nums.size, thrs.size)) + for i, thr in enumerate(thrs): + recalls[:, i] = (ious_ >= thr).sum(axis=1) / float(total_gt_num) + + return recalls + + +def set_recall_param(proposal_nums, iou_thrs): + if isinstance(proposal_nums, list): + proposal_nums_ = np.array(proposal_nums) + elif isinstance(proposal_nums, int): + proposal_nums_ = np.array([proposal_nums]) + else: + proposal_nums_ = proposal_nums + + if iou_thrs is None: + _iou_thrs = np.array([0.5]) + elif isinstance(iou_thrs, list): + _iou_thrs = np.array(iou_thrs) + elif isinstance(iou_thrs, float): + _iou_thrs = np.array([iou_thrs]) + else: + _iou_thrs = iou_thrs + + return proposal_nums_, _iou_thrs + + +def eval_recalls(gts, proposals, proposal_nums=None, iou_thrs=None): + """Calculate recalls. 
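+
+    Args:
+        gts (list[np.ndarray]): per-image ground-truth boxes, each of shape (n, 4).
+        proposals (list[np.ndarray]): per-image proposals, shape (m, 4) or (m, 5)
+            with a score in the last column (used for sorting).
+        proposal_nums (int | list[int] | np.ndarray): proposal budgets to evaluate.
+        iou_thrs (float | list[float] | np.ndarray | None): IoU thresholds, default 0.5.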
""" + img_num = len(gts) + assert img_num == len(proposals) + + proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs) + + all_ious = [] + for i in range(img_num): + if proposals[i].ndim == 2 and proposals[i].shape[1] == 5: + scores = proposals[i][:, 4] + sort_idx = np.argsort(scores)[::-1] + img_proposal = proposals[i][sort_idx, :] + else: + img_proposal = proposals[i] + + prop_num = min(img_proposal.shape[0], proposal_nums[-1]) + if gts[i] is None or gts[i].shape[0] == 0: + ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32) + else: + ious = bbox_overlaps( + torch.tensor(gts[i]), + torch.tensor(img_proposal[:prop_num, :4])) + ious = ious.data.numpy() + all_ious.append(ious) + all_ious = np.array(all_ious) + recalls = _recalls(all_ious, proposal_nums, iou_thrs) + return recalls diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/registry.py b/Bank_second_part/detect_process/paddlevideo/metrics/registry.py new file mode 100644 index 0000000..2214440 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/registry.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..utils import Registry + +METRIC = Registry('metric') diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/segmentation_metric.py b/Bank_second_part/detect_process/paddlevideo/metrics/segmentation_metric.py new file mode 100644 index 0000000..3719450 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/segmentation_metric.py @@ -0,0 +1,389 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ +import numpy as np +import argparse +import pandas as pd + +from .registry import METRIC +from .base import BaseMetric +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +def get_labels_scores_start_end_time(input_np, + frame_wise_labels, + actions_dict, + bg_class=["background", "None"]): + labels = [] + starts = [] + ends = [] + scores = [] + + boundary_score_ptr = 0 + + last_label = frame_wise_labels[0] + if frame_wise_labels[0] not in bg_class: + labels.append(frame_wise_labels[0]) + starts.append(0) + for i in range(len(frame_wise_labels)): + if frame_wise_labels[i] != last_label: + if frame_wise_labels[i] not in bg_class: + labels.append(frame_wise_labels[i]) + starts.append(i) + if last_label not in bg_class: + ends.append(i) + score = np.mean( + input_np[actions_dict[labels[boundary_score_ptr]], \ + starts[boundary_score_ptr]:(ends[boundary_score_ptr] + 1)] + ) + scores.append(score) + boundary_score_ptr = boundary_score_ptr + 1 + last_label = frame_wise_labels[i] + if last_label not in bg_class: + ends.append(i + 1) + score = np.mean( + input_np[actions_dict[labels[boundary_score_ptr]], \ + starts[boundary_score_ptr]:(ends[boundary_score_ptr] + 1)] + ) + scores.append(score) + boundary_score_ptr = boundary_score_ptr + 1 + + return labels, starts, ends, scores + + +def get_labels_start_end_time(frame_wise_labels, + bg_class=["background", "None"]): + labels = [] + starts = [] + ends = [] + last_label = frame_wise_labels[0] + if frame_wise_labels[0] not in bg_class: + labels.append(frame_wise_labels[0]) + starts.append(0) + for i in range(len(frame_wise_labels)): + if frame_wise_labels[i] != last_label: + if frame_wise_labels[i] not in bg_class: + labels.append(frame_wise_labels[i]) + starts.append(i) + if last_label not in bg_class: + ends.append(i) + last_label = frame_wise_labels[i] + if last_label not in bg_class: + ends.append(i + 1) + return labels, starts, ends + + +def levenstein(p, y, norm=False): + m_row = len(p) + n_col = len(y) + D = np.zeros([m_row + 1, n_col + 1], np.float) + for i in range(m_row + 1): + D[i, 0] = i + for i in range(n_col + 1): + D[0, i] = i + + for j in range(1, n_col + 1): + for i in range(1, m_row + 1): + if y[j - 1] == p[i - 1]: + D[i, j] = D[i - 1, j - 1] + else: + D[i, j] = min(D[i - 1, j] + 1, D[i, j - 1] + 1, + D[i - 1, j - 1] + 1) + + if norm: + score = (1 - D[-1, -1] / max(m_row, n_col)) * 100 + else: + score = D[-1, -1] + + return score + + +def edit_score(recognized, + ground_truth, + norm=True, + bg_class=["background", "None"]): + P, _, _ = get_labels_start_end_time(recognized, bg_class) + Y, _, _ = get_labels_start_end_time(ground_truth, bg_class) + return levenstein(P, Y, norm) + + +def f_score(recognized, ground_truth, overlap, bg_class=["background", "None"]): + p_label, p_start, p_end = get_labels_start_end_time(recognized, bg_class) + y_label, y_start, y_end = get_labels_start_end_time(ground_truth, bg_class) + + tp = 0 + fp = 0 + + hits = np.zeros(len(y_label)) + + for j in range(len(p_label)): + intersection = np.minimum(p_end[j], y_end) - np.maximum( + p_start[j], y_start) + union = np.maximum(p_end[j], y_end) - np.minimum(p_start[j], y_start) + IoU = (1.0 * intersection / union) * ( + [p_label[j] == y_label[x] for x in range(len(y_label))]) + # Get the best scoring segment + idx = np.array(IoU).argmax() + + if IoU[idx] >= overlap and not hits[idx]: + tp += 1 + hits[idx] = 1 + else: + fp += 1 + fn = len(y_label) - sum(hits) + return float(tp), float(fp), float(fn) + + +def 
boundary_AR(pred_boundary, gt_boundary, overlap_list, max_proposal): + + p_label, p_start, p_end, p_scores = pred_boundary + y_label, y_start, y_end, _ = gt_boundary + + # sort proposal + pred_dict = { + "label": p_label, + "start": p_start, + "end": p_end, + "scores": p_scores + } + pdf = pd.DataFrame(pred_dict) + pdf = pdf.sort_values(by="scores", ascending=False) + p_label = list(pdf["label"]) + p_start = list(pdf["start"]) + p_end = list(pdf["end"]) + p_scores = list(pdf["scores"]) + + # refine AN + if len(p_label) < max_proposal and len(p_label) > 0: + p_label = p_label + [p_label[-1]] * (max_proposal - len(p_label)) + p_start = p_start + [p_start[-1]] * (max_proposal - len(p_start)) + p_start = p_start + p_start[len(p_start) - + (max_proposal - len(p_start)):] + p_end = p_end + [p_end[-1]] * (max_proposal - len(p_end)) + p_scores = p_scores + [p_scores[-1]] * (max_proposal - len(p_scores)) + elif len(p_label) > max_proposal: + p_label[max_proposal:] = [] + p_start[max_proposal:] = [] + p_end[max_proposal:] = [] + p_scores[max_proposal:] = [] + + t_AR = np.zeros(len(overlap_list)) + + for i in range(len(overlap_list)): + overlap = overlap_list[i] + + tp = 0 + fp = 0 + hits = np.zeros(len(y_label)) + + for j in range(len(p_label)): + intersection = np.minimum(p_end[j], y_end) - np.maximum( + p_start[j], y_start) + union = np.maximum(p_end[j], y_end) - np.minimum( + p_start[j], y_start) + IoU = (1.0 * intersection / union) + # Get the best scoring segment + idx = np.array(IoU).argmax() + + if IoU[idx] >= overlap and not hits[idx]: + tp += 1 + hits[idx] = 1 + else: + fp += 1 + fn = len(y_label) - sum(hits) + + recall = float(tp) / (float(tp) + float(fn)) + t_AR[i] = recall + + AR = np.mean(t_AR) + return AR + + +@METRIC.register +class SegmentationMetric(BaseMetric): + """ + Test for Video Segmentation based model. 
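+    Reports frame-level accuracy (Acc), segmental edit score (Edit),
+    segmental F1 at the configured overlap thresholds, and boundary
+    AR@AN / AUC computed from the predicted boundaries.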
+ """ + + def __init__(self, + data_size, + batch_size, + overlap, + actions_map_file_path, + log_interval=1, + tolerance=5, + boundary_threshold=0.7, + max_proposal=100): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + # actions dict generate + file_ptr = open(actions_map_file_path, 'r') + actions = file_ptr.read().split('\n')[:-1] + file_ptr.close() + self.actions_dict = dict() + for a in actions: + self.actions_dict[a.split()[1]] = int(a.split()[0]) + + # cls score + self.overlap = overlap + self.overlap_len = len(overlap) + + self.cls_tp = np.zeros(self.overlap_len) + self.cls_fp = np.zeros(self.overlap_len) + self.cls_fn = np.zeros(self.overlap_len) + self.total_correct = 0 + self.total_edit = 0 + self.total_frame = 0 + self.total_video = 0 + + # boundary score + self.max_proposal = max_proposal + self.AR_at_AN = [[] for _ in range(max_proposal)] + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + groundTruth = data[1] + + predicted = outputs['predict'] + output_np = outputs['output_np'] + + outputs_np = predicted.numpy() + outputs_arr = output_np.numpy()[0, :] + gt_np = groundTruth.numpy()[0, :] + + recognition = [] + for i in range(outputs_np.shape[0]): + recognition = np.concatenate((recognition, [ + list(self.actions_dict.keys())[list( + self.actions_dict.values()).index(outputs_np[i])] + ])) + recog_content = list(recognition) + + gt_content = [] + for i in range(gt_np.shape[0]): + gt_content = np.concatenate((gt_content, [ + list(self.actions_dict.keys())[list( + self.actions_dict.values()).index(gt_np[i])] + ])) + gt_content = list(gt_content) + + pred_boundary = get_labels_scores_start_end_time( + outputs_arr, recog_content, self.actions_dict) + gt_boundary = get_labels_scores_start_end_time( + np.ones(outputs_arr.shape), gt_content, self.actions_dict) + + # cls score + correct = 0 + total = 0 + edit = 0 + + for i in range(len(gt_content)): + total += 1 + #accumulate + self.total_frame += 1 + + if gt_content[i] == recog_content[i]: + correct += 1 + #accumulate + self.total_correct += 1 + + edit_num = edit_score(recog_content, gt_content) + edit += edit_num + self.total_edit += edit_num + + for s in range(self.overlap_len): + tp1, fp1, fn1 = f_score(recog_content, gt_content, self.overlap[s]) + + # accumulate + self.cls_tp[s] += tp1 + self.cls_fp[s] += fp1 + self.cls_fn[s] += fn1 + + # accumulate + self.total_video += 1 + + # proposal score + for AN in range(self.max_proposal): + AR = boundary_AR(pred_boundary, + gt_boundary, + self.overlap, + max_proposal=(AN + 1)) + self.AR_at_AN[AN].append(AR) + + def accumulate(self): + """accumulate metrics when finished all iters. 
+ """ + # cls metric + Acc = 100 * float(self.total_correct) / self.total_frame + Edit = (1.0 * self.total_edit) / self.total_video + Fscore = dict() + for s in range(self.overlap_len): + precision = self.cls_tp[s] / float(self.cls_tp[s] + self.cls_fp[s]) + recall = self.cls_tp[s] / float(self.cls_tp[s] + self.cls_fn[s]) + + f1 = 2.0 * (precision * recall) / (precision + recall) + + f1 = np.nan_to_num(f1) * 100 + Fscore[self.overlap[s]] = f1 + + # proposal metric + proposal_AUC = np.array(self.AR_at_AN) * 100 + AUC = np.mean(proposal_AUC) + AR_at_AN1 = np.mean(proposal_AUC[0, :]) + AR_at_AN5 = np.mean(proposal_AUC[4, :]) + AR_at_AN15 = np.mean(proposal_AUC[14, :]) + + # log metric + log_mertic_info = "dataset model performence: " + # preds ensemble + log_mertic_info += "Acc: {:.4f}, ".format(Acc) + log_mertic_info += 'Edit: {:.4f}, '.format(Edit) + for s in range(len(self.overlap)): + log_mertic_info += 'F1@{:0.2f}: {:.4f}, '.format( + self.overlap[s], Fscore[self.overlap[s]]) + + # boundary metric + log_mertic_info += "Auc: {:.4f}, ".format(AUC) + log_mertic_info += "AR@AN1: {:.4f}, ".format(AR_at_AN1) + log_mertic_info += "AR@AN5: {:.4f}, ".format(AR_at_AN5) + log_mertic_info += "AR@AN15: {:.4f}, ".format(AR_at_AN15) + logger.info(log_mertic_info) + + # log metric + metric_dict = dict() + metric_dict['Acc'] = Acc + metric_dict['Edit'] = Edit + for s in range(len(self.overlap)): + metric_dict['F1@{:0.2f}'.format( + self.overlap[s])] = Fscore[self.overlap[s]] + metric_dict['Auc'] = AUC + metric_dict['AR@AN1'] = AR_at_AN1 + metric_dict['AR@AN5'] = AR_at_AN5 + metric_dict['AR@AN15'] = AR_at_AN15 + + # clear for next epoch + # cls + self.cls_tp = np.zeros(self.overlap_len) + self.cls_fp = np.zeros(self.overlap_len) + self.cls_fn = np.zeros(self.overlap_len) + self.total_correct = 0 + self.total_edit = 0 + self.total_frame = 0 + self.total_video = 0 + # proposal + self.AR_at_AN = [[] for _ in range(self.max_proposal)] + + return metric_dict diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/skeleton_metric.py b/Bank_second_part/detect_process/paddlevideo/metrics/skeleton_metric.py new file mode 100644 index 0000000..7978478 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/skeleton_metric.py @@ -0,0 +1,96 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import numpy as np +import paddle +import csv +import paddle.nn.functional as F + +from .registry import METRIC +from .base import BaseMetric +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@METRIC.register +class SkeletonMetric(BaseMetric): + """ + Test for Skeleton based model. + note: only support batch size = 1, single card test. + + Args: + out_file: str, file to save test results. 
+ """ + + def __init__(self, + data_size, + batch_size, + out_file='submission.csv', + log_interval=1, + top_k=5): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.top1 = [] + self.top5 = [] + self.values = [] + self.out_file = out_file + self.k = top_k + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + if data[0].shape[0] != outputs.shape[0]: + num_segs = data[0].shape[1] + batch_size = outputs.shape[0] + outputs = outputs.reshape( + [batch_size // num_segs, num_segs, outputs.shape[-1]]) + outputs = outputs.mean(axis=1) + if len(data) == 2: # data with label + labels = data[1] + top1 = paddle.metric.accuracy(input=outputs, label=labels, k=1) + top5 = paddle.metric.accuracy(input=outputs, label=labels, k=self.k) + if self.world_size > 1: + top1 = paddle.distributed.all_reduce( + top1, op=paddle.distributed.ReduceOp.SUM) / self.world_size + top5 = paddle.distributed.all_reduce( + top5, op=paddle.distributed.ReduceOp.SUM) / self.world_size + self.top1.append(top1.numpy()) + self.top5.append(top5.numpy()) + else: # data without label, only support batch_size=1. Used for fsd-10. + prob = F.softmax(outputs) + clas = paddle.argmax(prob, axis=1).numpy()[0] + self.values.append((batch_id, clas)) + + # preds ensemble + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + if self.top1: # data with label + logger.info('[TEST] finished, avg_acc1= {}, avg_acc5= {}'.format( + np.mean(np.array(self.top1)), np.mean(np.array(self.top5)))) + else: + headers = ['sample_index', 'predict_category'] + with open( + self.out_file, + 'w', + ) as fp: + writer = csv.writer(fp) + writer.writerow(headers) + writer.writerows(self.values) + logger.info("Results saved in {} !".format(self.out_file)) diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/transnetv2_metric.py b/Bank_second_part/detect_process/paddlevideo/metrics/transnetv2_metric.py new file mode 100644 index 0000000..3370881 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/transnetv2_metric.py @@ -0,0 +1,174 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ +import numpy as np + +from .registry import METRIC +from .base import BaseMetric +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +def predictions_to_scenes(predictions): + scenes = [] + t, t_prev, start = -1, 0, 0 + for i, t in enumerate(predictions): + if t_prev == 1 and t == 0: + start = i + if t_prev == 0 and t == 1 and i != 0: + scenes.append([start, i]) + t_prev = t + if t == 0: + scenes.append([start, i]) + + # just fix if all predictions are 1 + if len(scenes) == 0: + return np.array([[0, len(predictions) - 1]], dtype=np.int32) + + return np.array(scenes, dtype=np.int32) + + +def evaluate_scenes(gt_scenes, pred_scenes, n_frames_miss_tolerance=2): + """ + Adapted from: https://github.com/gyglim/shot-detection-evaluation + The original based on: http://imagelab.ing.unimore.it/imagelab/researchActivity.asp?idActivity=19 + + n_frames_miss_tolerance: + Number of frames it is possible to miss ground truth by, and still being counted as a correct detection. + + Examples of computation with different tolerance margin: + n_frames_miss_tolerance = 0 + pred_scenes: [[0, 5], [6, 9]] -> pred_trans: [[5.5, 5.5]] + gt_scenes: [[0, 5], [6, 9]] -> gt_trans: [[5.5, 5.5]] -> HIT + gt_scenes: [[0, 4], [5, 9]] -> gt_trans: [[4.5, 4.5]] -> MISS + n_frames_miss_tolerance = 1 + pred_scenes: [[0, 5], [6, 9]] -> pred_trans: [[5.0, 6.0]] + gt_scenes: [[0, 5], [6, 9]] -> gt_trans: [[5.0, 6.0]] -> HIT + gt_scenes: [[0, 4], [5, 9]] -> gt_trans: [[4.0, 5.0]] -> HIT + gt_scenes: [[0, 3], [4, 9]] -> gt_trans: [[3.0, 4.0]] -> MISS + n_frames_miss_tolerance = 2 + pred_scenes: [[0, 5], [6, 9]] -> pred_trans: [[4.5, 6.5]] + gt_scenes: [[0, 5], [6, 9]] -> gt_trans: [[4.5, 6.5]] -> HIT + gt_scenes: [[0, 4], [5, 9]] -> gt_trans: [[3.5, 5.5]] -> HIT + gt_scenes: [[0, 3], [4, 9]] -> gt_trans: [[2.5, 4.5]] -> HIT + gt_scenes: [[0, 2], [3, 9]] -> gt_trans: [[1.5, 3.5]] -> MISS + + Users should be careful about adopting these functions in any commercial matters. 
+ """ + + shift = n_frames_miss_tolerance / 2 + gt_scenes = gt_scenes.astype(np.float32) + np.array([[-0.5 + shift, 0.5 - shift]]) + pred_scenes = pred_scenes.astype(np.float32) + np.array([[-0.5 + shift, 0.5 - shift]]) + + gt_trans = np.stack([gt_scenes[:-1, 1], gt_scenes[1:, 0]], 1) + pred_trans = np.stack([pred_scenes[:-1, 1], pred_scenes[1:, 0]], 1) + + i, j = 0, 0 + tp, fp, fn = 0, 0, 0 + + while i < len(gt_trans) or j < len(pred_trans): + if j == len(pred_trans) or pred_trans[j, 0] > gt_trans[i, 1]: + fn += 1 + i += 1 + elif i == len(gt_trans) or pred_trans[j, 1] < gt_trans[i, 0]: + fp += 1 + j += 1 + else: + i += 1 + j += 1 + tp += 1 + + if tp + fp != 0: + p = tp / (tp + fp) + else: + p = 0 + + if tp + fn != 0: + r = tp / (tp + fn) + else: + r = 0 + + if p + r != 0: + f1 = (p * r * 2) / (p + r) + else: + f1 = 0 + + assert tp + fn == len(gt_trans) + assert tp + fp == len(pred_trans) + + return p, r, f1, (tp, fp, fn) + + +def create_scene_based_summaries(one_hot_pred, one_hot_gt): + thresholds = np.array([ + 0.02, 0.06, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9 + ]) + precision, recall, f1, tp, fp, fn = np.zeros_like(thresholds), np.zeros_like(thresholds),\ + np.zeros_like(thresholds), np.zeros_like(thresholds),\ + np.zeros_like(thresholds), np.zeros_like(thresholds) + + gt_scenes = predictions_to_scenes(one_hot_gt) + for i in range(len(thresholds)): + pred_scenes = predictions_to_scenes( + (one_hot_pred > thresholds[i]).astype(np.uint8) + ) + precision[i], recall[i], f1[i], (tp[i], fp[i], fn[i]) = evaluate_scenes(gt_scenes, pred_scenes) + + best_idx = np.argmax(f1) + + return f1[best_idx] + + +@METRIC.register +class TransNetV2Metric(BaseMetric): + def __init__(self, data_size, batch_size, log_interval=1): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.predictions = [] + self.total_stats = {"tp": 0, "fp": 0, "fn": 0} + + def update(self, batch_id, data, one_hot): + """update metrics during each iter + """ + if isinstance(one_hot, tuple): + one_hot = one_hot[0] + one_hot = paddle.nn.functional.sigmoid(one_hot)[0] + self.predictions.append(one_hot.numpy()[25:75]) + gt_scenes = data[1] + is_new_file = data[2] + if is_new_file: + self.compute(gt_scenes) + # preds ensemble + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def compute(self, gt_scenes): + predictions = np.concatenate(self.predictions, 0)[:len(frames)] + _, _, _, (tp, fp, fn), fp_mistakes, fn_mistakes = evaluate_scenes( + gt_scenes, predictions_to_scenes((predictions >= args.thr).astype(np.uint8))) + + self.total_stats["tp"] += tp + self.total_stats["fp"] += fp + self.total_stats["fn"] += fn + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + p = self.total_stats["tp"] / (self.total_stats["tp"] + self.total_stats["fp"]) + r = self.total_stats["tp"] / (self.total_stats["tp"] + self.total_stats["fn"]) + f1 = (p * r * 2) / (p + r) + logger.info('[TEST] finished, Precision= {:5.2f}, Recall= {:5.2f} , F1 Score= {:5.2f} '.format( + p * 100, r * 100, f1 * 100)) \ No newline at end of file diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ucf24_utils.py b/Bank_second_part/detect_process/paddlevideo/metrics/ucf24_utils.py new file mode 100644 index 0000000..6552645 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ucf24_utils.py @@ -0,0 +1,783 @@ +# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Forked from: https://github.com/rafaelpadilla/Object-Detection-Metrics +# Developed by: Rafael Padilla (rafael.padilla@smt.ufrj.br) + +import glob +import os +import shutil +import sys +from collections import Counter +import numpy as np +from enum import Enum +import cv2 + + +class MethodAveragePrecision(Enum): + """ + Class representing if the coordinates are relative to the + image size or are absolute values. + + Developed by: Rafael Padilla + Last modification: Apr 28 2018 + """ + EveryPointInterpolation = 1 + ElevenPointInterpolation = 2 + + +class CoordinatesType(Enum): + """ + Class representing if the coordinates are relative to the + image size or are absolute values. + + Developed by: Rafael Padilla + Last modification: Apr 28 2018 + """ + Relative = 1 + Absolute = 2 + + +class BBType(Enum): + """ + Class representing if the bounding box is groundtruth or not. + + Developed by: Rafael Padilla + Last modification: May 24 2018 + """ + GroundTruth = 1 + Detected = 2 + + +class BBFormat(Enum): + """ + Class representing the format of a bounding box. + It can be (X,Y,width,height) => XYWH + or (X1,Y1,X2,Y2) => XYX2Y2 + Developed by: Rafael Padilla + Last modification: May 24 2018 + """ + XYWH = 1 + XYX2Y2 = 2 + + +def convertToRelativeValues(size, box): + dw = 1. / (size[0]) + dh = 1. 
/ (size[1]) + cx = (box[1] + box[0]) / 2.0 + cy = (box[3] + box[2]) / 2.0 + w = box[1] - box[0] + h = box[3] - box[2] + x = cx * dw + y = cy * dh + w = w * dw + h = h * dh + return x, y, w, h + + +def convertToAbsoluteValues(size, box): + xIn = round(((2 * float(box[0]) - float(box[2])) * size[0] / 2)) + yIn = round(((2 * float(box[1]) - float(box[3])) * size[1] / 2)) + xEnd = xIn + round(float(box[2]) * size[0]) + yEnd = yIn + round(float(box[3]) * size[1]) + if xIn < 0: + xIn = 0 + if yIn < 0: + yIn = 0 + if xEnd >= size[0]: + xEnd = size[0] - 1 + if yEnd >= size[1]: + yEnd = size[1] - 1 + return xIn, yIn, xEnd, yEnd + + +def add_bb_into_image(image, bb, color=(255, 0, 0), thickness=2, label=None): + r = int(color[0]) + g = int(color[1]) + b = int(color[2]) + + font = cv2.FONT_HERSHEY_SIMPLEX + fontScale = 0.5 + fontThickness = 1 + + x1, y1, x2, y2 = bb.getAbsoluteBoundingBox(BBFormat.XYX2Y2) + x1 = int(x1) + y1 = int(y1) + x2 = int(x2) + y2 = int(y2) + cv2.rectangle(image, (x1, y1), (x2, y2), (b, g, r), thickness) + # Add label + if label is not None: + # Get size of the text box + (tw, th) = cv2.getTextSize(label, font, fontScale, fontThickness)[0] + # Top-left coord of the textbox + (xin_bb, yin_bb) = (x1 + thickness, y1 - th + int(12.5 * fontScale)) + # Checking position of the text top-left (outside or inside the bb) + if yin_bb - th <= 0: # if outside the image + yin_bb = y1 + th # put it inside the bb + r_Xin = x1 - int(thickness / 2) + r_Yin = y1 - th - int(thickness / 2) + # Draw filled rectangle to put the text in it + cv2.rectangle(image, (r_Xin, r_Yin - thickness), + (r_Xin + tw + thickness * 3, r_Yin + th + int(12.5 * fontScale)), (b, g, r), + -1) + cv2.putText(image, label, (xin_bb, yin_bb), font, fontScale, (0, 0, 0), fontThickness, + cv2.LINE_AA) + return image + + +class BoundingBox: + def __init__(self, + imageName, + classId, + x, + y, + w, + h, + typeCoordinates=None, + imgSize=None, + bbType=None, + classConfidence=None, + format=None): + """Constructor. + Args: + imageName: String representing the image name. + classId: String value representing class id. + x: Float value representing the X upper-left coordinate of the bounding box. + y: Float value representing the Y upper-left coordinate of the bounding box. + w: Float value representing the width bounding box. + h: Float value representing the height bounding box. + typeCoordinates: (optional) Enum (Relative or Absolute) represents if the bounding box + coordinates (x,y,w,h) are absolute or relative to size of the image. Default:'Absolute'. + imgSize: (optional) 2D vector (width, height)=>(int, int) represents the size of the + image of the bounding box. If typeCoordinates is 'Relative', imgSize is required. + bbType: (optional) Enum (Groundtruth or Detection) identifies if the bounding box + represents a ground truth or a detection. If it is a detection, the classConfidence has + to be informed. + classConfidence: (optional) Float value representing the confidence of the detected + class. If detectionType is Detection, classConfidence needs to be informed. + format: (optional) Enum (BBFormat.XYWH or BBFormat.XYX2Y2) indicating the format of the + coordinates of the bounding boxes. BBFormat.XYWH: + BBFormat.XYX2Y2: . + """ + self._imageName = imageName + self._typeCoordinates = typeCoordinates + if typeCoordinates == CoordinatesType.Relative and imgSize is None: + raise IOError( + 'Parameter \'imgSize\' is required. 
It is necessary to inform the image size.') + if bbType == BBType.Detected and classConfidence is None: + raise IOError( + 'For bbType=\'Detection\', it is necessary to inform the classConfidence value.') + + self._classConfidence = classConfidence + self._bbType = bbType + self._classId = classId + self._format = format + + # If relative coordinates, convert to absolute values + # For relative coords: (x,y,w,h)=(X_center/img_width , Y_center/img_height) + if typeCoordinates == CoordinatesType.Relative: + (self._x, self._y, self._w, self._h) = convertToAbsoluteValues(imgSize, (x, y, w, h)) + self._width_img = imgSize[0] + self._height_img = imgSize[1] + if format == BBFormat.XYWH: + self._x2 = self._w + self._y2 = self._h + self._w = self._x2 - self._x + self._h = self._y2 - self._y + else: + raise IOError( + 'For relative coordinates, the format must be XYWH (x,y,width,height)') + # For absolute coords: (x,y,w,h)=real bb coords + else: + self._x = x + self._y = y + if format == BBFormat.XYWH: + self._w = w + self._h = h + self._x2 = self._x + self._w + self._y2 = self._y + self._h + else: # format == BBFormat.XYX2Y2: . + self._x2 = w + self._y2 = h + self._w = self._x2 - self._x + self._h = self._y2 - self._y + if imgSize is None: + self._width_img = None + self._height_img = None + else: + self._width_img = imgSize[0] + self._height_img = imgSize[1] + + def getAbsoluteBoundingBox(self, format=None): + if format == BBFormat.XYWH: + return self._x, self._y, self._w, self._h + elif format == BBFormat.XYX2Y2: + return self._x, self._y, self._x2, self._y2 + + def getRelativeBoundingBox(self, imgSize=None): + if imgSize is None and self._width_img is None and self._height_img is None: + raise IOError( + 'Parameter \'imgSize\' is required. It is necessary to inform the image size.') + if imgSize is None: + return convertToRelativeValues((imgSize[0], imgSize[1]), + (self._x, self._y, self._w, self._h)) + else: + return convertToRelativeValues((self._width_img, self._height_img), + (self._x, self._y, self._w, self._h)) + + def getImageName(self): + return self._imageName + + def getConfidence(self): + return self._classConfidence + + def getFormat(self): + return self._format + + def getClassId(self): + return self._classId + + def getImageSize(self): + return self._width_img, self._height_img + + def getCoordinatesType(self): + return self._typeCoordinates + + def getBBType(self): + return self._bbType + + @staticmethod + def compare(det1, det2): + det1BB = det1.getAbsoluteBoundingBox(format=BBFormat.XYWH) + det1ImgSize = det1.getImageSize() + det2BB = det2.getAbsoluteBoundingBox(format=BBFormat.XYWH) + det2ImgSize = det2.getImageSize() + + if det1.getClassId() == det2.getClassId() and \ + det1.classConfidence == det2.classConfidenc() and \ + det1BB[0] == det2BB[0] and \ + det1BB[1] == det2BB[1] and \ + det1BB[2] == det2BB[2] and \ + det1BB[3] == det2BB[3] and \ + det1ImgSize[0] == det1ImgSize[0] and \ + det2ImgSize[1] == det2ImgSize[1]: + return True + return False + + @staticmethod + def clone(boundingBox): + absBB = boundingBox.getAbsoluteBoundingBox(format=BBFormat.XYWH) + newBoundingBox = BoundingBox( + boundingBox.getImageName(), + boundingBox.getClassId(), + absBB[0], + absBB[1], + absBB[2], + absBB[3], + typeCoordinates=boundingBox.getCoordinatesType(), + imgSize=boundingBox.getImageSize(), + bbType=boundingBox.getBBType(), + classConfidence=boundingBox.getConfidence(), + format=BBFormat.XYWH) + return newBoundingBox + + +class BoundingBoxes: + def __init__(self): + self._boundingBoxes = 
[] + + def addBoundingBox(self, bb): + self._boundingBoxes.append(bb) + + def removeBoundingBox(self, _boundingBox): + for d in self._boundingBoxes: + if BoundingBox.compare(d, _boundingBox): + del self._boundingBoxes[d] + return + + def removeAllBoundingBoxes(self): + self._boundingBoxes = [] + + def getBoundingBoxes(self): + return self._boundingBoxes + + def getBoundingBoxByClass(self, classId): + boundingBoxes = [] + for d in self._boundingBoxes: + if d.getClassId() == classId: # get only specified bounding box type + boundingBoxes.append(d) + return boundingBoxes + + def getClasses(self): + classes = [] + for d in self._boundingBoxes: + c = d.getClassId() + if c not in classes: + classes.append(c) + return classes + + def getBoundingBoxesByType(self, bbType): + # get only specified bb type + return [d for d in self._boundingBoxes if d.getBBType() == bbType] + + def getBoundingBoxesByImageName(self, imageName): + # get only specified bb type + return [d for d in self._boundingBoxes if d.getImageName() == imageName] + + def count(self, bbType=None): + if bbType is None: # Return all bounding boxes + return len(self._boundingBoxes) + count = 0 + for d in self._boundingBoxes: + if d.getBBType() == bbType: # get only specified bb type + count += 1 + return count + + def clone(self): + newBoundingBoxes = BoundingBoxes() + for d in self._boundingBoxes: + det = BoundingBox.clone(d) + newBoundingBoxes.addBoundingBox(det) + return newBoundingBoxes + + def drawAllBoundingBoxes(self, image, imageName): + bbxes = self.getBoundingBoxesByImageName(imageName) + for bb in bbxes: + if bb.getBBType() == BBType.GroundTruth: # if ground truth + image = add_bb_into_image(image, bb, color=(0, 255, 0)) # green + else: # if detection + image = add_bb_into_image(image, bb, color=(255, 0, 0)) # red + return image + + +class Evaluator: + def GetPascalVOCMetrics(self, + boundingboxes, + IOUThreshold=0.5, + method=None): + """Get the metrics used by the VOC Pascal 2012 challenge. + Get + Args: + boundingboxes: Object of the class BoundingBoxes representing ground truth and detected + bounding boxes; + IOUThreshold: IOU threshold indicating which detections will be considered TP or FP + (default value = 0.5); + method (default = EveryPointInterpolation): It can be calculated as the implementation + in the official PASCAL VOC toolkit (EveryPointInterpolation), or applying the 11-point + interpolatio as described in the paper "The PASCAL Visual Object Classes(VOC) Challenge" + or EveryPointInterpolation" (ElevenPointInterpolation); + Returns: + A list of dictionaries. Each dictionary contains information and metrics of each class. 
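# --- Illustrative usage sketch (not part of the original Object-Detection-Metrics
# fork): how the BoundingBox / BoundingBoxes API above is typically driven.
# Image name, class label, coordinates and the 0.91 confidence are made-up values.
gt = BoundingBox("frame_0001", "hand", 10, 20, 50, 80,
                 typeCoordinates=CoordinatesType.Absolute,
                 bbType=BBType.GroundTruth, format=BBFormat.XYWH)
det = BoundingBox("frame_0001", "hand", 12, 22, 60, 98,
                  typeCoordinates=CoordinatesType.Absolute,
                  bbType=BBType.Detected, classConfidence=0.91,
                  format=BBFormat.XYX2Y2)
boxes = BoundingBoxes()
boxes.addBoundingBox(gt)
boxes.addBoundingBox(det)
print(gt.getAbsoluteBoundingBox(BBFormat.XYX2Y2))  # (10, 20, 60, 100)
print(boxes.count(BBType.Detected))                # 1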
+ The keys of each dictionary are: + dict['class']: class representing the current dictionary; + dict['precision']: array with the precision values; + dict['recall']: array with the recall values; + dict['AP']: average precision; + dict['interpolated precision']: interpolated precision values; + dict['interpolated recall']: interpolated recall values; + dict['total positives']: total number of ground truth positives; + dict['total TP']: total number of True Positive detections; + dict['total FP']: total number of False Negative detections; + """ + ret = [] # list containing metrics (precision, recall, average precision) of each class + # List with all ground truths (Ex: [imageName,class,confidence=1, (bb coordinates XYX2Y2)]) + groundTruths = [] + # List with all detections (Ex: [imageName,class,confidence,(bb coordinates XYX2Y2)]) + detections = [] + # Get all classes + classes = [] + # Loop through all bounding boxes and separate them into GTs and detections + for bb in boundingboxes.getBoundingBoxes(): + # [imageName, class, confidence, (bb coordinates XYX2Y2)] + if bb.getBBType() == BBType.GroundTruth: + groundTruths.append([ + bb.getImageName(), + bb.getClassId(), 1, + bb.getAbsoluteBoundingBox(BBFormat.XYX2Y2) + ]) + else: + detections.append([ + bb.getImageName(), + bb.getClassId(), + bb.getConfidence(), + bb.getAbsoluteBoundingBox(BBFormat.XYX2Y2) + ]) + # get class + if bb.getClassId() not in classes: + classes.append(bb.getClassId()) + classes = sorted(classes) + # Precision x Recall is obtained individually by each class + # Loop through by classes + for c in classes: + # Get only detection of class c + dects = [] + [dects.append(d) for d in detections if d[1] == c] + # Get only ground truths of class c + gts = [] + [gts.append(g) for g in groundTruths if g[1] == c] + npos = len(gts) + # sort detections by decreasing confidence + dects = sorted(dects, key=lambda conf: conf[2], reverse=True) + TP = np.zeros(len(dects)) + FP = np.zeros(len(dects)) + # create dictionary with amount of gts for each image + det = Counter([cc[0] for cc in gts]) + for key, val in det.items(): + det[key] = np.zeros(val) + # Loop through detections + for d in range(len(dects)): + # Find ground truth image + gt = [gt for gt in gts if gt[0] == dects[d][0]] + iouMax = sys.float_info.min + for j in range(len(gt)): + iou = Evaluator.iou(dects[d][3], gt[j][3]) + if iou > iouMax: + iouMax = iou + jmax = j + # Assign detection as true positive/don't care/false positive + if iouMax >= IOUThreshold: + if det[dects[d][0]][jmax] == 0: + TP[d] = 1 # count as true positive + det[dects[d][0]][jmax] = 1 # flag as already 'seen' + else: + FP[d] = 1 # count as false positive + # - A detected "cat" is overlaped with a GT "cat" with IOU >= IOUThreshold. 
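# Worked toy numbers (illustration only) for the precision/recall bookkeeping
# computed just below: one class, npos = 3 ground-truth boxes, three detections
# already sorted by confidence and matched with the greedy IoU rule above.
#   TP      = [1, 0, 1]          FP      = [0, 1, 0]
#   acc_TP  = [1, 1, 2]          acc_FP  = [0, 1, 1]
#   recall    = acc_TP / npos              -> [0.33, 0.33, 0.67]
#   precision = acc_TP / (acc_TP + acc_FP) -> [1.00, 0.50, 0.67]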
+ else: + FP[d] = 1 # count as false positive + # compute precision, recall and average precision + acc_FP = np.cumsum(FP) + acc_TP = np.cumsum(TP) + rec = acc_TP / npos + prec = np.divide(acc_TP, (acc_FP + acc_TP)) + # Depending on the method, call the right implementation + if method == MethodAveragePrecision.EveryPointInterpolation: + [ap, mpre, mrec, ii] = Evaluator.CalculateAveragePrecision(rec, prec) + else: + [ap, mpre, mrec, _] = Evaluator.ElevenPointInterpolatedAP(rec, prec) + # add class result in the dictionary to be returned + r = { + 'class': c, + 'precision': prec, + 'recall': rec, + 'AP': ap, + 'interpolated precision': mpre, + 'interpolated recall': mrec, + 'total positives': npos, + 'total TP': np.sum(TP), + 'total FP': np.sum(FP) + } + ret.append(r) + return ret + + @staticmethod + def CalculateAveragePrecision(rec, prec): + mrec = [0] + [mrec.append(e) for e in rec] + mrec.append(1) + mpre = [0] + [mpre.append(e) for e in prec] + mpre.append(0) + for i in range(len(mpre) - 1, 0, -1): + mpre[i - 1] = max(mpre[i - 1], mpre[i]) + ii = [] + for i in range(len(mrec) - 1): + if mrec[1:][i] != mrec[0:-1][i]: + ii.append(i + 1) + ap = 0 + for i in ii: + ap = ap + np.sum((mrec[i] - mrec[i - 1]) * mpre[i]) + return [ap, mpre[0:len(mpre) - 1], mrec[0:len(mpre) - 1], ii] + + @staticmethod + # 11-point interpolated average precision + def ElevenPointInterpolatedAP(rec, prec): + mrec = [] + [mrec.append(e) for e in rec] + mpre = [] + [mpre.append(e) for e in prec] + recallValues = np.linspace(0, 1, 11) + recallValues = list(recallValues[::-1]) + rhoInterp = [] + recallValid = [] + for r in recallValues: + # Obtain all recall values higher or equal than r + argGreaterRecalls = np.argwhere(mrec[:] >= r) + pmax = 0 + # If there are recalls above r + if argGreaterRecalls.size != 0: + pmax = max(mpre[argGreaterRecalls.min():]) + recallValid.append(r) + rhoInterp.append(pmax) + # By definition AP = sum(max(precision whose recall is above r))/11 + ap = sum(rhoInterp) / 11 + # Generating values for the plot + rvals = [recallValid[0]] + [rvals.append(e) for e in recallValid] + rvals.append(0) + pvals = [0] + [pvals.append(e) for e in rhoInterp] + pvals.append(0) + # rhoInterp = rhoInterp[::-1] + cc = [] + for i in range(len(rvals)): + p = (rvals[i], pvals[i - 1]) + if p not in cc: + cc.append(p) + p = (rvals[i], pvals[i]) + if p not in cc: + cc.append(p) + recallValues = [i[0] for i in cc] + rhoInterp = [i[1] for i in cc] + return [ap, rhoInterp, recallValues, None] + + # For each detections, calculate IOU with reference + @staticmethod + def _getAllIOUs(reference, detections): + ret = [] + bbReference = reference.getAbsoluteBoundingBox(BBFormat.XYX2Y2) + # img = np.zeros((200,200,3), np.uint8) + for d in detections: + bb = d.getAbsoluteBoundingBox(BBFormat.XYX2Y2) + iou = Evaluator.iou(bbReference, bb) + ret.append((iou, reference, d)) # iou, reference, detection + return sorted(ret, key=lambda i: i[0], reverse=True) # sort by iou (from highest to lowest) + + @staticmethod + def iou(boxA, boxB): + # if boxes dont intersect + if Evaluator._boxesIntersect(boxA, boxB) is False: + return 0 + interArea = Evaluator._getIntersectionArea(boxA, boxB) + union = Evaluator._getUnionAreas(boxA, boxB, interArea=interArea) + # intersection over union + iou = interArea / union + assert iou >= 0 + return iou + + @staticmethod + def _boxesIntersect(boxA, boxB): + if boxA[0] > boxB[2]: + return False # boxA is right of boxB + if boxB[0] > boxA[2]: + return False # boxA is left of boxB + if boxA[3] < boxB[1]: + 
return False # boxA is above boxB + if boxA[1] > boxB[3]: + return False # boxA is below boxB + return True + + @staticmethod + def _getIntersectionArea(boxA, boxB): + xA = max(boxA[0], boxB[0]) + yA = max(boxA[1], boxB[1]) + xB = min(boxA[2], boxB[2]) + yB = min(boxA[3], boxB[3]) + # intersection area + return (xB - xA + 1) * (yB - yA + 1) + + @staticmethod + def _getUnionAreas(boxA, boxB, interArea=None): + area_A = Evaluator._getArea(boxA) + area_B = Evaluator._getArea(boxB) + if interArea is None: + interArea = Evaluator._getIntersectionArea(boxA, boxB) + return float(area_A + area_B - interArea) + + @staticmethod + def _getArea(box): + return (box[2] - box[0] + 1) * (box[3] - box[1] + 1) + + +# Validate formats +def ValidateFormats(argFormat, argName, errors): + if argFormat == 'xywh': + return BBFormat.XYWH + elif argFormat == 'xyrb': + return BBFormat.XYX2Y2 + elif argFormat is None: + return BBFormat.XYWH # default when nothing is passed + else: + errors.append( + 'argument %s: invalid value. It must be either \'xywh\' or \'xyrb\'' % argName) + + +# Validate mandatory args +def ValidateMandatoryArgs(arg, argName, errors): + if arg is None: + errors.append('argument %s: required argument' % argName) + else: + return True + + +def ValidateImageSize(arg, argName, argInformed, errors): + errorMsg = 'argument %s: required argument if %s is relative' % (argName, argInformed) + ret = None + if arg is None: + errors.append(errorMsg) + else: + arg = arg.replace('(', '').replace(')', '') + args = arg.split(',') + if len(args) != 2: + errors.append( + '%s. It must be in the format \'width,height\' (e.g. \'600,400\')' % errorMsg) + else: + if not args[0].isdigit() or not args[1].isdigit(): + errors.append( + '%s. It must be in INdiaTEGER the format \'width,height\' (e.g. \'600,400\')' % + errorMsg) + else: + ret = (int(args[0]), int(args[1])) + return ret + + +# Validate coordinate types +def ValidateCoordinatesTypes(arg, argName, errors): + if arg == 'abs': + return CoordinatesType.Absolute + elif arg == 'rel': + return CoordinatesType.Relative + elif arg is None: + return CoordinatesType.Absolute # default when nothing is passed + errors.append('argument %s: invalid value. 
It must be either \'rel\' or \'abs\'' % argName) + + +def getBoundingBoxes(directory, + isGT, + bbFormat, + coordType, + allBoundingBoxes=None, + allClasses=None, + imgSize=(0, 0)): + """Read txt files containing bounding boxes (ground truth and detections).""" + print(directory) + if allBoundingBoxes is None: + allBoundingBoxes = BoundingBoxes() + if allClasses is None: + allClasses = [] + # Read ground truths + os.chdir(directory) + files = glob.glob("*.txt") + files.sort() + + for f in files: + nameOfImage = f.replace(".txt", "") + fh1 = open(f, "r") + for line in fh1: + line = line.replace("\n", "") + if line.replace(' ', '') == '': + continue + splitLine = line.split(" ") + if isGT: + idClass = (splitLine[0]) # class + x = float(splitLine[1]) + y = float(splitLine[2]) + w = float(splitLine[3]) + h = float(splitLine[4]) + bb = BoundingBox( + nameOfImage, + idClass, + x, + y, + w, + h, + coordType, + imgSize, + BBType.GroundTruth, + format=bbFormat) + else: + idClass = (splitLine[0]) # class + confidence = float(splitLine[1]) + x = float(splitLine[2]) + y = float(splitLine[3]) + w = float(splitLine[4]) + h = float(splitLine[5]) + bb = BoundingBox( + nameOfImage, + idClass, + x, + y, + w, + h, + coordType, + imgSize, + BBType.Detected, + confidence, + format=bbFormat) + allBoundingBoxes.addBoundingBox(bb) + if idClass not in allClasses: + allClasses.append(idClass) + fh1.close() + return allBoundingBoxes, allClasses + + +def get_mAP(gtFolder, detFolder, threshold=0.5, savePath=None): + gtFormat = 'xyrb' + detFormat = 'xyrb' + gtCoordinates = 'abs' + detCoordinates = 'abs' + gtFolder = os.path.join(os.path.abspath('.'), gtFolder) + detFolder = os.path.join(os.path.abspath('.'), detFolder) + + iouThreshold = threshold + + # Arguments validation + errors = [] + # Validate formats + gtFormat = ValidateFormats(gtFormat, 'gtFormat', errors) + detFormat = ValidateFormats(detFormat, '-detformat', errors) + + # Coordinates types + gtCoordType = ValidateCoordinatesTypes(gtCoordinates, '-gtCoordinates', errors) + detCoordType = ValidateCoordinatesTypes(detCoordinates, '-detCoordinates', errors) + imgSize = (0, 0) + + # Create directory to save results + shutil.rmtree(savePath, ignore_errors=True) # Clear folder + if savePath is not None: + os.makedirs(savePath) + + # Get groundtruth boxes + allBoundingBoxes, allClasses = getBoundingBoxes( + gtFolder, True, gtFormat, gtCoordType, imgSize=imgSize) + # Get detected boxes + allBoundingBoxes, allClasses = getBoundingBoxes( + detFolder, False, detFormat, detCoordType, allBoundingBoxes, allClasses, imgSize=imgSize) + allClasses.sort() + + evaluator = Evaluator() + acc_AP = 0 + validClasses = 0 + + # Plot Precision x Recall curve + detections = evaluator.GetPascalVOCMetrics(allBoundingBoxes, iouThreshold, + method=MethodAveragePrecision.EveryPointInterpolation) + + # each detection is a class and store AP and mAP results in AP_res list + AP_res = [] + for metricsPerClass in detections: + # Get metric values per each class + cl = metricsPerClass['class'] + ap = metricsPerClass['AP'] + totalPositives = metricsPerClass['total positives'] + + if totalPositives > 0: + validClasses = validClasses + 1 + acc_AP = acc_AP + ap + ap_str = "{0:.2f}%".format(ap * 100) + AP_res.append('AP: %s (%s)' % (ap_str, cl)) + mAP = acc_AP / validClasses + mAP_str = "{0:.2f}%".format(mAP * 100) + AP_res.append('mAP: %s' % mAP_str) + return AP_res \ No newline at end of file diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/vos_metric.py 
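# Hedged usage sketch for get_mAP above; the folder names are hypothetical.
# Both folders hold one .txt file per frame: ground-truth lines are
# "<class> <x1> <y1> <x2> <y2>", detection lines are
# "<class> <confidence> <x1> <y1> <x2> <y2>", matching the 'xyrb'/absolute
# settings hard-coded in get_mAP.
ap_lines = get_mAP("groundtruths_ucf", "detections_ucf",
                   threshold=0.5, savePath="mAP_out")
for line in ap_lines:   # one "AP: ...% (<class>)" entry per class, then "mAP: ...%"
    print(line)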
b/Bank_second_part/detect_process/paddlevideo/metrics/vos_metric.py new file mode 100644 index 0000000..54eadb8 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/vos_metric.py @@ -0,0 +1,276 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import os +import paddle +import zipfile +import time +from PIL import Image + +from paddle.io import DataLoader + +from .registry import METRIC +from .base import BaseMetric +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@METRIC.register +class VOSMetric(BaseMetric): + def __init__(self, + data_size, + batch_size, + result_root, + zip_dir, + log_interval=1): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.video_num = 0 + self.total_time = 0 + self.total_frame = 0 + self.total_sfps = 0 + self.total_video_num = data_size + self.count = 0 + self.result_root = result_root + self.zip_dir = zip_dir + + def update(self, batch_id, data, model): + """update metrics during each iter + """ + self.video_num += 1 + seq_dataset = data + seq_name = seq_dataset.seq_name + + logger.info('Prcessing Seq {} [{}/{}]:'.format(seq_name, self.video_num, + self.total_video_num)) + seq_dataloader = DataLoader(seq_dataset, + return_list=True, + batch_size=1, + shuffle=False, + num_workers=0) + seq_total_time = 0 + seq_total_frame = 0 + ref_embeddings = [] + ref_masks = [] + prev_embedding = [] + prev_mask = [] + with paddle.no_grad(): + for frame_idx, samples in enumerate(seq_dataloader): + time_start = time.time() + all_preds = [] + join_label = None + for aug_idx in range(len(samples)): + if len(ref_embeddings) <= aug_idx: + ref_embeddings.append([]) + ref_masks.append([]) + prev_embedding.append(None) + prev_mask.append(None) + + sample = samples[aug_idx] + ref_emb = ref_embeddings[aug_idx] + ref_m = ref_masks[aug_idx] + prev_emb = prev_embedding[aug_idx] + prev_m = prev_mask[aug_idx] + + current_img = sample['current_img'] + if 'current_label' in sample.keys(): + current_label = sample['current_label'] + current_label = paddle.to_tensor(current_label) + else: + current_label = None + + obj_num = sample['meta']['obj_num'] + imgname = sample['meta']['current_name'] + ori_height = sample['meta']['height'] + ori_width = sample['meta']['width'] + current_img = current_img + obj_num = obj_num + bs, _, h, w = current_img.shape + data_batch = [ + ref_emb, ref_m, prev_emb, prev_m, current_img, + [ori_height, ori_width], obj_num + ] + + all_pred, current_embedding = model(data_batch, mode='test') + + if frame_idx == 0: + if current_label is None: + logger.info( + "No first frame label in Seq {}.".format( + seq_name)) + ref_embeddings[aug_idx].append(current_embedding) + ref_masks[aug_idx].append(current_label) + + prev_embedding[aug_idx] = current_embedding + prev_mask[aug_idx] = current_label + else: + if sample['meta']['flip']: #False + all_pred = self.flip_tensor(all_pred, 3) + # In YouTube-VOS, not all the objects appear in the first frame for the first time. 
Thus, we + # have to introduce new labels for new objects, if necessary. + if not sample['meta']['flip'] and not ( + current_label is None) and join_label is None: + join_label = paddle.cast(current_label, + dtype='int64') + all_preds.append(all_pred) + if current_label is not None: + ref_embeddings[aug_idx].append(current_embedding) + prev_embedding[aug_idx] = current_embedding + + if frame_idx > 0: + all_preds = paddle.concat(all_preds, axis=0) + all_preds = paddle.mean( + all_preds, axis=0) #average results if augmentation + pred_label = paddle.argmax(all_preds, axis=0) + if join_label is not None: + join_label = paddle.squeeze(paddle.squeeze(join_label, + axis=0), + axis=0) + keep = paddle.cast((join_label == 0), dtype="int64") + pred_label = pred_label * keep + join_label * (1 - keep) + pred_label = pred_label + current_label = paddle.reshape( + pred_label, shape=[1, 1, ori_height, ori_width]) + flip_pred_label = self.flip_tensor(pred_label, 1) + flip_current_label = paddle.reshape( + flip_pred_label, shape=[1, 1, ori_height, ori_width]) + + for aug_idx in range(len(samples)): + if join_label is not None: + if samples[aug_idx]['meta']['flip']: + ref_masks[aug_idx].append(flip_current_label) + else: + ref_masks[aug_idx].append(current_label) + if samples[aug_idx]['meta']['flip']: + prev_mask[aug_idx] = flip_current_label + else: + prev_mask[ + aug_idx] = current_label #update prev_mask + + one_frametime = time.time() - time_start + seq_total_time += one_frametime + seq_total_frame += 1 + obj_num = float(obj_num) + logger.info('Frame: {}, Obj Num: {}, Time: {}'.format( + imgname[0], obj_num, one_frametime)) + self.save_mask( + pred_label, + os.path.join(self.result_root, seq_name, + imgname[0].split('.')[0] + '.png')) + else: + one_frametime = time.time() - time_start + seq_total_time += one_frametime + logger.info('Ref Frame: {}, Time: {}'.format( + imgname[0], one_frametime)) + + del (ref_embeddings) + del (ref_masks) + del (prev_embedding) + del (prev_mask) + del (seq_dataset) + del (seq_dataloader) + + seq_avg_time_per_frame = seq_total_time / seq_total_frame + self.total_time += seq_total_time + self.total_frame += seq_total_frame + total_avg_time_per_frame = self.total_time / self.total_frame + self.total_sfps += seq_avg_time_per_frame + avg_sfps = self.total_sfps / (batch_id + 1) + logger.info("Seq {} FPS: {}, Total FPS: {}, FPS per Seq: {}".format( + seq_name, 1. / seq_avg_time_per_frame, + 1. / total_avg_time_per_frame, 1. 
/ avg_sfps)) + + def flip_tensor(self, tensor, dim=0): + inv_idx = paddle.cast(paddle.arange(tensor.shape[dim] - 1, -1, -1), + dtype="int64") + tensor = paddle.index_select(x=tensor, index=inv_idx, axis=dim) + return tensor + + def save_mask(self, mask_tensor, path): + _palette = [ + 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128, + 0, 128, 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191, + 128, 0, 64, 0, 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0, 64, + 0, 128, 64, 0, 0, 191, 0, 128, 191, 0, 0, 64, 128, 128, 64, 128, 22, + 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26, 26, 27, 27, 27, + 28, 28, 28, 29, 29, 29, 30, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, + 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 38, 38, 38, 39, + 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43, 43, 44, 44, 44, + 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49, 49, 50, 50, + 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 56, + 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, 61, + 62, 62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, + 67, 68, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 73, + 73, 73, 74, 74, 74, 75, 75, 75, 76, 76, 76, 77, 77, 77, 78, 78, 78, + 79, 79, 79, 80, 80, 80, 81, 81, 81, 82, 82, 82, 83, 83, 83, 84, 84, + 84, 85, 85, 85, 86, 86, 86, 87, 87, 87, 88, 88, 88, 89, 89, 89, 90, + 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94, 94, 94, 95, 95, 95, + 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100, 100, 100, 101, + 101, 101, 102, 102, 102, 103, 103, 103, 104, 104, 104, 105, 105, + 105, 106, 106, 106, 107, 107, 107, 108, 108, 108, 109, 109, 109, + 110, 110, 110, 111, 111, 111, 112, 112, 112, 113, 113, 113, 114, + 114, 114, 115, 115, 115, 116, 116, 116, 117, 117, 117, 118, 118, + 118, 119, 119, 119, 120, 120, 120, 121, 121, 121, 122, 122, 122, + 123, 123, 123, 124, 124, 124, 125, 125, 125, 126, 126, 126, 127, + 127, 127, 128, 128, 128, 129, 129, 129, 130, 130, 130, 131, 131, + 131, 132, 132, 132, 133, 133, 133, 134, 134, 134, 135, 135, 135, + 136, 136, 136, 137, 137, 137, 138, 138, 138, 139, 139, 139, 140, + 140, 140, 141, 141, 141, 142, 142, 142, 143, 143, 143, 144, 144, + 144, 145, 145, 145, 146, 146, 146, 147, 147, 147, 148, 148, 148, + 149, 149, 149, 150, 150, 150, 151, 151, 151, 152, 152, 152, 153, + 153, 153, 154, 154, 154, 155, 155, 155, 156, 156, 156, 157, 157, + 157, 158, 158, 158, 159, 159, 159, 160, 160, 160, 161, 161, 161, + 162, 162, 162, 163, 163, 163, 164, 164, 164, 165, 165, 165, 166, + 166, 166, 167, 167, 167, 168, 168, 168, 169, 169, 169, 170, 170, + 170, 171, 171, 171, 172, 172, 172, 173, 173, 173, 174, 174, 174, + 175, 175, 175, 176, 176, 176, 177, 177, 177, 178, 178, 178, 179, + 179, 179, 180, 180, 180, 181, 181, 181, 182, 182, 182, 183, 183, + 183, 184, 184, 184, 185, 185, 185, 186, 186, 186, 187, 187, 187, + 188, 188, 188, 189, 189, 189, 190, 190, 190, 191, 191, 191, 192, + 192, 192, 193, 193, 193, 194, 194, 194, 195, 195, 195, 196, 196, + 196, 197, 197, 197, 198, 198, 198, 199, 199, 199, 200, 200, 200, + 201, 201, 201, 202, 202, 202, 203, 203, 203, 204, 204, 204, 205, + 205, 205, 206, 206, 206, 207, 207, 207, 208, 208, 208, 209, 209, + 209, 210, 210, 210, 211, 211, 211, 212, 212, 212, 213, 213, 213, + 214, 214, 214, 215, 215, 215, 216, 216, 216, 217, 217, 217, 218, + 218, 218, 219, 219, 219, 220, 220, 220, 221, 221, 221, 222, 222, + 222, 223, 223, 223, 224, 224, 224, 225, 225, 225, 226, 226, 226, + 227, 227, 227, 228, 228, 228, 229, 229, 229, 230, 230, 230, 231, + 
231, 231, 232, 232, 232, 233, 233, 233, 234, 234, 234, 235, 235, + 235, 236, 236, 236, 237, 237, 237, 238, 238, 238, 239, 239, 239, + 240, 240, 240, 241, 241, 241, 242, 242, 242, 243, 243, 243, 244, + 244, 244, 245, 245, 245, 246, 246, 246, 247, 247, 247, 248, 248, + 248, 249, 249, 249, 250, 250, 250, 251, 251, 251, 252, 252, 252, + 253, 253, 253, 254, 254, 254, 255, 255, 255 + ] + mask = mask_tensor.cpu().numpy().astype('uint8') + mask = Image.fromarray(mask).convert('P') + mask.putpalette(_palette) + mask.save(path) + + def zip_folder(self, source_folder, zip_dir): + f = zipfile.ZipFile(zip_dir, 'w', zipfile.ZIP_DEFLATED) + pre_len = len(os.path.dirname(source_folder)) + for dirpath, dirnames, filenames in os.walk(source_folder): + for filename in filenames: + pathfile = os.path.join(dirpath, filename) + arcname = pathfile[pre_len:].strip(os.path.sep) + f.write(pathfile, arcname) + f.close() + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + self.zip_folder(self.result_root, self.zip_dir) + logger.info('Save result to {}.'.format(self.zip_dir)) diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__init__.py b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..81d734c Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/average_precision_calculator.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/average_precision_calculator.cpython-310.pyc new file mode 100644 index 0000000..a1817cc Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/average_precision_calculator.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/eval_util.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/eval_util.cpython-310.pyc new file mode 100644 index 0000000..c6404cf Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/eval_util.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/mean_average_precision_calculator.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/mean_average_precision_calculator.cpython-310.pyc new file mode 100644 index 0000000..1907e7a Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/mean_average_precision_calculator.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/average_precision_calculator.py b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/average_precision_calculator.py new file mode 100644 index 0000000..bdbd6e0 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/average_precision_calculator.py @@ -0,0 +1,274 @@ +# Copyright 2020 Google Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Calculate or keep track of the interpolated average precision. + +It provides an interface for calculating interpolated average precision for an +entire list or the top-n ranked items. For the definition of the +(non-)interpolated average precision: +http://trec.nist.gov/pubs/trec15/appendices/CE.MEASURES06.pdf + +Example usages: +1) Use it as a static function call to directly calculate average precision for +a short ranked list in the memory. + +``` +import random + +p = np.array([random.random() for _ in xrange(10)]) +a = np.array([random.choice([0, 1]) for _ in xrange(10)]) + +ap = average_precision_calculator.AveragePrecisionCalculator.ap(p, a) +``` + +2) Use it as an object for long ranked list that cannot be stored in memory or +the case where partial predictions can be observed at a time (Tensorflow +predictions). In this case, we first call the function accumulate many times +to process parts of the ranked list. After processing all the parts, we call +peek_interpolated_ap_at_n. +``` +p1 = np.array([random.random() for _ in xrange(5)]) +a1 = np.array([random.choice([0, 1]) for _ in xrange(5)]) +p2 = np.array([random.random() for _ in xrange(5)]) +a2 = np.array([random.choice([0, 1]) for _ in xrange(5)]) + +# interpolated average precision at 10 using 1000 break points +calculator = average_precision_calculator.AveragePrecisionCalculator(10) +calculator.accumulate(p1, a1) +calculator.accumulate(p2, a2) +ap3 = calculator.peek_ap_at_n() +``` +""" + +import heapq +import random +import numbers + +import numpy + + +class AveragePrecisionCalculator(object): + """Calculate the average precision and average precision at n.""" + def __init__(self, top_n=None): + """Construct an AveragePrecisionCalculator to calculate average precision. + + This class is used to calculate the average precision for a single label. + + Args: + top_n: A positive Integer specifying the average precision at n, or + None to use all provided data points. + + Raises: + ValueError: An error occurred when the top_n is not a positive integer. + """ + if not ((isinstance(top_n, int) and top_n >= 0) or top_n is None): + raise ValueError("top_n must be a positive integer or None.") + + self._top_n = top_n # average precision at n + self._total_positives = 0 # total number of positives have seen + self._heap = [] # max heap of (prediction, actual) + + @property + def heap_size(self): + """Gets the heap size maintained in the class.""" + return len(self._heap) + + @property + def num_accumulated_positives(self): + """Gets the number of positive samples that have been accumulated.""" + return self._total_positives + + def accumulate(self, predictions, actuals, num_positives=None): + """Accumulate the predictions and their ground truth labels. + + After the function call, we may call peek_ap_at_n to actually calculate + the average precision. + Note predictions and actuals must have the same shape. + + Args: + predictions: a list storing the prediction scores. 
+ actuals: a list storing the ground truth labels. Any value + larger than 0 will be treated as positives, otherwise as negatives. + num_positives = If the 'predictions' and 'actuals' inputs aren't complete, + then it's possible some true positives were missed in them. In that case, + you can provide 'num_positives' in order to accurately track recall. + + Raises: + ValueError: An error occurred when the format of the input is not the + numpy 1-D array or the shape of predictions and actuals does not match. + """ + if len(predictions) != len(actuals): + raise ValueError( + "the shape of predictions and actuals does not match.") + + if not num_positives is None: + if not isinstance(num_positives, + numbers.Number) or num_positives < 0: + raise ValueError( + "'num_positives' was provided but it wan't a nonzero number." + ) + + if not num_positives is None: + self._total_positives += num_positives + else: + self._total_positives += numpy.size(numpy.where(actuals > 0)) + topk = self._top_n + heap = self._heap + + for i in range(numpy.size(predictions)): + if topk is None or len(heap) < topk: + heapq.heappush(heap, (predictions[i], actuals[i])) + else: + if predictions[i] > heap[0][0]: # heap[0] is the smallest + heapq.heappop(heap) + heapq.heappush(heap, (predictions[i], actuals[i])) + + def clear(self): + """Clear the accumulated predictions.""" + self._heap = [] + self._total_positives = 0 + + def peek_ap_at_n(self): + """Peek the non-interpolated average precision at n. + + Returns: + The non-interpolated average precision at n (default 0). + If n is larger than the length of the ranked list, + the average precision will be returned. + """ + if self.heap_size <= 0: + return 0 + predlists = numpy.array(list(zip(*self._heap))) + + ap = self.ap_at_n(predlists[0], + predlists[1], + n=self._top_n, + total_num_positives=self._total_positives) + return ap + + @staticmethod + def ap(predictions, actuals): + """Calculate the non-interpolated average precision. + + Args: + predictions: a numpy 1-D array storing the sparse prediction scores. + actuals: a numpy 1-D array storing the ground truth labels. Any value + larger than 0 will be treated as positives, otherwise as negatives. + + Returns: + The non-interpolated average precision at n. + If n is larger than the length of the ranked list, + the average precision will be returned. + + Raises: + ValueError: An error occurred when the format of the input is not the + numpy 1-D array or the shape of predictions and actuals does not match. + """ + return AveragePrecisionCalculator.ap_at_n(predictions, actuals, n=None) + + @staticmethod + def ap_at_n(predictions, actuals, n=20, total_num_positives=None): + """Calculate the non-interpolated average precision. + + Args: + predictions: a numpy 1-D array storing the sparse prediction scores. + actuals: a numpy 1-D array storing the ground truth labels. Any value + larger than 0 will be treated as positives, otherwise as negatives. + n: the top n items to be considered in ap@n. + total_num_positives : (optionally) you can specify the number of total + positive + in the list. If specified, it will be used in calculation. + + Returns: + The non-interpolated average precision at n. + If n is larger than the length of the ranked list, + the average precision will be returned. + + Raises: + ValueError: An error occurred when + 1) the format of the input is not the numpy 1-D array; + 2) the shape of predictions and actuals does not match; + 3) the input n is not a positive integer. 
+ """ + if len(predictions) != len(actuals): + raise ValueError( + "the shape of predictions and actuals does not match.") + + if n is not None: + if not isinstance(n, int) or n <= 0: + raise ValueError("n must be 'None' or a positive integer." + " It was '%s'." % n) + + ap = 0.0 + + predictions = numpy.array(predictions) + actuals = numpy.array(actuals) + + # add a shuffler to avoid overestimating the ap + predictions, actuals = AveragePrecisionCalculator._shuffle( + predictions, actuals) + sortidx = sorted(range(len(predictions)), + key=lambda k: predictions[k], + reverse=True) + + if total_num_positives is None: + numpos = numpy.size(numpy.where(actuals > 0)) + else: + numpos = total_num_positives + + if numpos == 0: + return 0 + + if n is not None: + numpos = min(numpos, n) + delta_recall = 1.0 / numpos + poscount = 0.0 + + # calculate the ap + r = len(sortidx) + if n is not None: + r = min(r, n) + for i in range(r): + if actuals[sortidx[i]] > 0: + poscount += 1 + ap += poscount / (i + 1) * delta_recall + return ap + + @staticmethod + def _shuffle(predictions, actuals): + random.seed(0) + suffidx = random.sample(range(len(predictions)), len(predictions)) + predictions = predictions[suffidx] + actuals = actuals[suffidx] + return predictions, actuals + + @staticmethod + def _zero_one_normalize(predictions, epsilon=1e-7): + """Normalize the predictions to the range between 0.0 and 1.0. + + For some predictions like SVM predictions, we need to normalize them before + calculate the interpolated average precision. The normalization will not + change the rank in the original list and thus won't change the average + precision. + + Args: + predictions: a numpy 1-D array storing the sparse prediction scores. + epsilon: a small constant to avoid denominator being zero. + + Returns: + The normalized prediction. + """ + denominator = numpy.max(predictions) - numpy.min(predictions) + ret = (predictions - numpy.min(predictions)) / numpy.max( + denominator, epsilon) + return ret diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/eval_util.py b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/eval_util.py new file mode 100644 index 0000000..724c72f --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/eval_util.py @@ -0,0 +1,205 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Provides functions to help with evaluating models.""" +import numpy as np +import paddle +from paddlevideo.utils import get_logger + +from ..base import BaseMetric +from ..registry import METRIC +from . import average_precision_calculator as ap_calculator +from . import mean_average_precision_calculator as map_calculator + +logger = get_logger("paddlevideo") + + +def flatten(l): + """ Merges a list of lists into a single list. 
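# Minimal, hedged sketch of the streaming AveragePrecisionCalculator defined in
# the module above (toy scores/labels; the class is assumed to be in scope here):
# accumulate chunks of (score, label) pairs, then read AP@n off the internal heap.
import numpy as np

calc = AveragePrecisionCalculator(top_n=10)
calc.accumulate(np.array([0.9, 0.3, 0.7]), np.array([1, 0, 1]))
calc.accumulate(np.array([0.6, 0.2]), np.array([0, 1]))
print(calc.peek_ap_at_n())   # non-interpolated AP over the pooled top-10, ~0.87 here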
""" + return [item for sublist in l for item in sublist] + + +def calculate_hit_at_one(predictions, actuals): + """ + Hit@k: indicates the fraction of test samples that contain at least + one of the ground truth labels in the top k predictions, + i.e topk. + + Args: + predictions: Matrix containing the outputs of the model. + Dimensions are 'batch' x 'num_classes'. + actuals: Matrix containing the ground truth labels. + Dimensions are 'batch' x 'num_classes'. + + Returns: + float: The average hit at one across the entire batch. + """ + top_prediction = np.argmax(predictions, 1) + hits = actuals[np.arange(actuals.shape[0]), top_prediction] + return np.mean(hits) + + +def calculate_precision_at_equal_recall_rate(predictions, actuals): + """ + PERR: measures the video-level annotation precision when we retrieve the same number + of entities per video as there are in the ground-truth. + More details please refer to: https://arxiv.org/abs/1609.08675 + + Args: + predictions: Matrix containing the outputs of the model. + Dimensions are 'batch' x 'num_classes'. + actuals: Matrix containing the ground truth labels. + Dimensions are 'batch' x 'num_classes'. + + Returns: + float: The average precision at equal recall rate across the entire batch. + """ + aggregated_precision = 0.0 + num_videos = actuals.shape[0] + for row in np.arange(num_videos): + num_labels = int(np.sum(actuals[row])) + top_indices = np.argpartition(predictions[row], + -num_labels)[-num_labels:] + item_precision = 0.0 + for label_index in top_indices: + if predictions[row][label_index] > 0: + item_precision += actuals[row][label_index] + item_precision /= top_indices.size + aggregated_precision += item_precision + aggregated_precision /= num_videos + return aggregated_precision + + +def calculate_gap(predictions, actuals, top_k=20): + """ + GAP: the global average precision. + + Only the top_k predictions are taken for each of the videos. + + Args: + predictions: Matrix containing the outputs of the model. + Dimensions are 'batch' x 'num_classes'. + actuals: Matrix containing the ground truth labels. + Dimensions are 'batch' x 'num_classes'. + top_k: How many predictions to use per video. + + Returns: + float: The global average precision. + """ + gap_calculator = ap_calculator.AveragePrecisionCalculator() + sparse_predictions, sparse_labels, num_positives = top_k_by_class( + predictions, actuals, top_k) + gap_calculator.accumulate(flatten(sparse_predictions), + flatten(sparse_labels), sum(num_positives)) + return gap_calculator.peek_ap_at_n() + + +def top_k_by_class(predictions, labels, k=20): + """Extracts the top k predictions for each video, sorted by class. + + Args: + predictions: A numpy matrix containing the outputs of the model. + Dimensions are 'batch' x 'num_classes'. + k: the top k non-zero entries to preserve in each prediction. + + Returns: + A tuple (predictions,labels, true_positives). 'predictions' and 'labels' + are lists of lists of floats. 'true_positives' is a list of scalars. The + length of the lists are equal to the number of classes. The entries in the + predictions variable are probability predictions, and + the corresponding entries in the labels variable are the ground truth for + those predictions. The entries in 'true_positives' are the number of true + positives for each class in the ground truth. + + Raises: + ValueError: An error occurred when the k is not a positive integer. 
+ """ + if k <= 0: + raise ValueError("k must be a positive integer.") + k = min(k, predictions.shape[1]) + num_classes = predictions.shape[1] + prediction_triplets = [] + for video_index in range(predictions.shape[0]): + prediction_triplets.extend( + top_k_triplets(predictions[video_index], labels[video_index], k)) + out_predictions = [[] for v in range(num_classes)] + out_labels = [[] for v in range(num_classes)] + for triplet in prediction_triplets: + out_predictions[triplet[0]].append(triplet[1]) + out_labels[triplet[0]].append(triplet[2]) + out_true_positives = [np.sum(labels[:, i]) for i in range(num_classes)] + + return out_predictions, out_labels, out_true_positives + + +def top_k_triplets(predictions, labels, k=20): + """Get the top_k for a 1-d numpy array. Returns a sparse list of tuples in + (prediction, class) format""" + m = len(predictions) + k = min(k, m) + indices = np.argpartition(predictions, -k)[-k:] + return [(index, predictions[index], labels[index]) for index in indices] + + +@METRIC.register +class HitOneMetric(BaseMetric): + """A class to store the evaluation metrics.""" + def __init__(self, + num_class, + top_k, + data_size, + batch_size, + log_interval=20): + """Construct an HitOneMetric object to store the evaluation metrics.""" + self.hit_at_one = [] + self.perr = [] + self.gap = [] + super().__init__(data_size, batch_size, log_interval) + + def accumulate(self): + logger.info( + '[TEST] finished, hit_at_one = {:.5f}, perr = {:.5f}, gap = {:.5f}'. + format(np.mean(np.array(self.hit_at_one)), + np.mean(np.array(self.perr)), np.mean(np.array(self.gap)))) + + def clear(self): + """Clear the evaluation metrics and reset the HitOneMetric object.""" + self.hit_at_one = [] + self.perr = [] + self.gap = [] + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + hit_at_one = paddle.to_tensor(outputs['hit_at_one']) + perr = paddle.to_tensor(outputs['perr']) + gap = paddle.to_tensor(outputs['gap']) + # NOTE(shipping): deal with multi cards validate + if self.world_size > 1: + hit_at_one = paddle.distributed.all_reduce( + hit_at_one, + op=paddle.distributed.ReduceOp.SUM) / self.world_size + perr = paddle.distributed.all_reduce( + perr, op=paddle.distributed.ReduceOp.SUM) / self.world_size + gap = paddle.distributed.all_reduce( + gap, op=paddle.distributed.ReduceOp.SUM) / self.world_size + + self.hit_at_one.append(hit_at_one.numpy()) + self.perr.append(perr.numpy()) + self.gap.append(gap.numpy()) + # preds ensemble + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{}...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size), + )) diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py new file mode 100644 index 0000000..0ae8b0e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py @@ -0,0 +1,114 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
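# Hedged end-to-end sketch of the video-level metrics above (toy batch of two
# videos, three classes; assumes calculate_hit_at_one,
# calculate_precision_at_equal_recall_rate and calculate_gap from eval_util are
# in scope). predictions and actuals are dense batch x num_classes arrays.
import numpy as np

preds = np.array([[0.9, 0.2, 0.4],
                  [0.1, 0.8, 0.3]])
labels = np.array([[1, 0, 0],
                   [0, 1, 1]])
print(calculate_hit_at_one(preds, labels))                      # 1.0
print(calculate_precision_at_equal_recall_rate(preds, labels))  # 1.0
print(calculate_gap(preds, labels, top_k=2))                    # ~0.92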
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Calculate the mean average precision. + +It provides an interface for calculating mean average precision +for an entire list or the top-n ranked items. + +Example usages: +We first call the function accumulate many times to process parts of the ranked +list. After processing all the parts, we call peek_map_at_n +to calculate the mean average precision. + +``` +import random + +p = np.array([[random.random() for _ in xrange(50)] for _ in xrange(1000)]) +a = np.array([[random.choice([0, 1]) for _ in xrange(50)] + for _ in xrange(1000)]) + +# mean average precision for 50 classes. +calculator = mean_average_precision_calculator.MeanAveragePrecisionCalculator( + num_class=50) +calculator.accumulate(p, a) +aps = calculator.peek_map_at_n() +``` +""" + +import numpy +from . import average_precision_calculator + + +class MeanAveragePrecisionCalculator(object): + """This class is to calculate mean average precision. + """ + + def __init__(self, num_class): + """Construct a calculator to calculate the (macro) average precision. + + Args: + num_class: A positive Integer specifying the number of classes. + top_n_array: A list of positive integers specifying the top n for each + class. The top n in each class will be used to calculate its average + precision at n. + The size of the array must be num_class. + + Raises: + ValueError: An error occurred when num_class is not a positive integer; + or the top_n_array is not a list of positive integers. + """ + if not isinstance(num_class, int) or num_class <= 1: + raise ValueError("num_class must be a positive integer.") + + self._ap_calculators = [] # member of AveragePrecisionCalculator + self._num_class = num_class # total number of classes + for i in range(num_class): + self._ap_calculators.append( + average_precision_calculator.AveragePrecisionCalculator()) + + def accumulate(self, predictions, actuals, num_positives=None): + """Accumulate the predictions and their ground truth labels. + + Args: + predictions: A list of lists storing the prediction scores. The outer + dimension corresponds to classes. + actuals: A list of lists storing the ground truth labels. The dimensions + should correspond to the predictions input. Any value + larger than 0 will be treated as positives, otherwise as negatives. + num_positives: If provided, it is a list of numbers representing the + number of true positives for each class. If not provided, the number of + true positives will be inferred from the 'actuals' array. + + Raises: + ValueError: An error occurred when the shape of predictions and actuals + does not match. + """ + if not num_positives: + num_positives = [None for i in predictions.shape[1]] + + calculators = self._ap_calculators + for i in range(len(predictions)): + calculators[i].accumulate(predictions[i], actuals[i], + num_positives[i]) + + def clear(self): + for calculator in self._ap_calculators: + calculator.clear() + + def is_empty(self): + return ([calculator.heap_size for calculator in self._ap_calculators] == + [0 for _ in range(self._num_class)]) + + def peek_map_at_n(self): + """Peek the non-interpolated mean average precision at n. 
+ + Returns: + An array of non-interpolated average precision at n (default 0) for each + class. + """ + aps = [ + self._ap_calculators[i].peek_ap_at_n() + for i in range(self._num_class) + ] + return aps diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/yowo_metric.py b/Bank_second_part/detect_process/paddlevideo/metrics/yowo_metric.py new file mode 100644 index 0000000..032df0c --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/yowo_metric.py @@ -0,0 +1,82 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import os +from paddlevideo.utils import get_logger +from .registry import METRIC +from .base import BaseMetric +from .ucf24_utils import get_mAP + +logger = get_logger("paddlevideo") + + +@METRIC.register +class YOWOMetric(BaseMetric): + """ + Metrics for YOWO. Two Stages in this metric: + (1) Get test results using trained model, results will be saved in YOWOMetric.result_path; + (2) Calculate metrics using results file from stage (1). + """ + + def __init__(self, + data_size, + batch_size, + gt_folder, + result_path, + threshold=0.5, + save_path=None, + log_interval=1): + """ + Init for BMN metrics. + Params: + gtfolder:groundtruth folder path for ucf24 + """ + super().__init__(data_size, batch_size, log_interval) + self.result_path = result_path + self.gt_folder = gt_folder + self.threshold = threshold + self.save_path = save_path + + if not osp.isdir(self.result_path): + os.makedirs(self.result_path) + + def update(self, batch_id, data, outputs): + frame_idx = outputs['frame_idx'] + boxes = outputs["boxes"] + for j in range(len(frame_idx)): + detection_path = osp.join(self.result_path, frame_idx[j]) + with open(detection_path, 'w+') as f_detect: + for box in boxes[j]: + x1 = round(float(box[0] - box[2] / 2.0) * 320.0) + y1 = round(float(box[1] - box[3] / 2.0) * 240.0) + x2 = round(float(box[0] + box[2] / 2.0) * 320.0) + y2 = round(float(box[1] + box[3] / 2.0) * 240.0) + + det_conf = float(box[4]) + for j in range((len(box) - 5) // 2): + cls_conf = float(box[5 + 2 * j].item()) + prob = det_conf * cls_conf + f_detect.write( + str(int(box[6]) + 1) + ' ' + str(prob) + ' ' + str(x1) + ' ' + str(y1) + ' ' + str( + x2) + ' ' + str(y2) + '\n') + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def accumulate(self): + metric_list = get_mAP(self.gt_folder, self.result_path, self.threshold, self.save_path) + for info in metric_list: + logger.info(info) diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/__init__.py new file mode 100644 index 0000000..639bd34 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/__init__.py @@ -0,0 +1,37 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
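# Hedged usage sketch for the MeanAveragePrecisionCalculator above (toy scores,
# two classes, four ranked items per class). num_positives is passed explicitly
# here, one count per class taken from the label rows.
import numpy as np

p = np.array([[0.9, 0.1, 0.8, 0.4],    # class-0 scores
              [0.2, 0.7, 0.3, 0.6]])   # class-1 scores
a = np.array([[1, 0, 1, 0],
              [0, 0, 0, 1]])
calculator = MeanAveragePrecisionCalculator(num_class=2)
calculator.accumulate(p, a, num_positives=[int(a[i].sum()) for i in range(2)])
aps = calculator.peek_map_at_n()       # per-class APs, [1.0, 0.5] for these numbers
print(sum(aps) / len(aps))             # macro mAP -> 0.75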
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .assigners import MaxIoUAssignerAVA +from .backbones import ResNet +from .builder import (build_backbone, build_head, build_localizer, build_loss, + build_recognizer) +from .framework.detectors import BaseDetector, FastRCNN, TwoStageDetector +from .framework.recognizers import BaseRecognizer, Recognizer2D +from .heads import (AVARoIHead, BaseHead, BBoxHeadAVA, SingleRoIExtractor3D, + TSNHead) +from .losses import CrossEntropyLoss +from .registry import (BACKBONES, DETECTORS, HEADS, LOCALIZERS, LOSSES, + PARTITIONERS, RECOGNIZERS, ROI_EXTRACTORS) +from .samplers import RandomSampler +from .weight_init import kaiming_normal_, trunc_normal_, weight_init_ + +__all__ = [ + 'BACKBONES', 'HEADS', 'RECOGNIZERS', 'LOCALIZERS', 'PARTITIONERS', 'LOSSES', + 'build_recognizer', 'build_localizer', 'build_head', 'build_backbone', + 'build_loss', 'ResNet', 'TSNHead', 'BaseHead', 'BaseRecognizer', + 'Recognizer2d', 'CrossEntropyLoss', 'ROI_EXTRACTORS', + 'SingleRoIExtractor3D', 'AVARoIHead', 'BBoxHeadAVA', 'MaxIoUAssignerAVA', + 'RandomSampler', 'DETECTORS', 'kaiming_normal_', 'trunc_normal_', + 'weight_init_' +] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..172853e Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/bbox_utils.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/bbox_utils.cpython-310.pyc new file mode 100644 index 0000000..8a6a2b9 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/bbox_utils.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/builder.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/builder.cpython-310.pyc new file mode 100644 index 0000000..53b88d9 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/builder.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/registry.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/registry.cpython-310.pyc new file mode 100644 index 0000000..ffcc723 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/registry.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/weight_init.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/weight_init.cpython-310.pyc new file mode 100644 index 0000000..8b4c7aa Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/weight_init.cpython-310.pyc differ diff --git 
a/Bank_second_part/detect_process/paddlevideo/modeling/assigners/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/assigners/__init__.py new file mode 100644 index 0000000..a4570db --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/assigners/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .max_iou_assigner_ava import MaxIoUAssignerAVA + +__all__ = ['MaxIoUAssignerAVA'] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/assigners/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/assigners/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..7eb2115 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/assigners/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/assigners/__pycache__/max_iou_assigner_ava.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/assigners/__pycache__/max_iou_assigner_ava.cpython-310.pyc new file mode 100644 index 0000000..485995b Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/assigners/__pycache__/max_iou_assigner_ava.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/assigners/max_iou_assigner_ava.py b/Bank_second_part/detect_process/paddlevideo/modeling/assigners/max_iou_assigner_ava.py new file mode 100644 index 0000000..5cc72bf --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/assigners/max_iou_assigner_ava.py @@ -0,0 +1,148 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +import numpy as np +from ..registry import BBOX_ASSIGNERS +from ..bbox_utils import bbox_overlaps + +class AssignResult(): + def __init__(self, num_gts, gt_inds, max_overlaps, labels=None): + self.num_gts = num_gts + self.gt_inds = gt_inds + self.max_overlaps = max_overlaps + self.labels = labels + + def add_gt_(self, gt_labels): + """Add ground truth as assigned results. 
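+        Each gt is prepended as its own assigned result: gt i gets index i + 1
+        with overlap 1.0, and its label is prepended to ``self.labels`` when
+        labels are present.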
""" + self_inds = paddle.arange(1, len(gt_labels) + 1, dtype="int32") + gt_inds_squeeze = paddle.squeeze(self.gt_inds, axis=0) + self.gt_inds = paddle.concat([self_inds, gt_inds_squeeze]) + gt_label_ones = paddle.full((len(gt_labels), ), 1, dtype='float32') + max_overlaps_squeeze = paddle.squeeze(self.max_overlaps, axis=0) + self.max_overlaps = paddle.concat([gt_label_ones, max_overlaps_squeeze]) + if self.labels is not None: + self.labels = paddle.concat([gt_labels, self.labels]) + +@BBOX_ASSIGNERS.register() +class MaxIoUAssignerAVA(): + """Assign a corresponding gt bbox or background to each bbox. """ + def __init__(self, + pos_iou_thr, + neg_iou_thr, + min_pos_iou=.0, + gt_max_assign_all=True, + ignore_iof_thr=-1, + ignore_wrt_candidates=True, + match_low_quality=True, + gpu_assign_thr=-1, + iou_calculator=dict(type='BboxOverlaps2D')): + self.pos_iou_thr = pos_iou_thr + self.neg_iou_thr = neg_iou_thr + self.min_pos_iou = min_pos_iou + self.gt_max_assign_all = gt_max_assign_all + self.ignore_iof_thr = ignore_iof_thr + self.ignore_wrt_candidates = ignore_wrt_candidates + self.gpu_assign_thr = gpu_assign_thr + self.match_low_quality = match_low_quality + + def assign(self, + bboxes, + gt_bboxes, + gt_labels=None): + """Assign gt to bboxes. """ + overlaps = bbox_overlaps(gt_bboxes, bboxes) + assign_result = self.assign_wrt_overlaps(overlaps, gt_labels) + return assign_result + + def assign_wrt_overlaps(self, overlaps, gt_labels=None): + """Assign w.r.t. the overlaps of bboxes with gts. """ + num_gts, num_bboxes = overlaps.shape[0], overlaps.shape[1] + # 1. assign -1 + assigned_gt_inds = paddle.full((num_bboxes, ), -1, dtype='int32') + + # for each anchor, which gt best overlaps with it + # for each anchor, the max iou of all gts + max_overlaps, argmax_overlaps = paddle.topk(overlaps, k=1, axis=0) + # for each gt, which anchor best overlaps with it + # for each gt, the max iou of all proposals + gt_max_overlaps, gt_argmax_overlaps = paddle.topk(overlaps, k=1, axis=1) + + # 2. assign negative: below the negative inds are set to be 0 + match_labels = paddle.full(argmax_overlaps.shape, -1, dtype='int32') + match_labels = paddle.where(max_overlaps < self.neg_iou_thr, + paddle.zeros_like(match_labels), match_labels) + + # 3. assign positive: above positive IoU threshold + argmax_overlaps_int32 = paddle.cast(argmax_overlaps, 'int32') + match_labels = paddle.where(max_overlaps >= self.pos_iou_thr, + argmax_overlaps_int32 + 1, match_labels) + assigned_gt_inds = match_labels + if self.match_low_quality: + # Low-quality matching will overwirte the assigned_gt_inds + # assigned in Step 3. Thus, the assigned gt might not be the + # best one for prediction. + # For example, if bbox A has 0.9 and 0.8 iou with GT bbox + # 1 & 2, bbox 1 will be assigned as the best target for bbox A + # in step 3. However, if GT bbox 2's gt_argmax_overlaps = A, + # bbox A's assigned_gt_inds will be overwritten to be bbox B. + # This might be the reason that it is not used in ROI Heads. 
+ for i in range(num_gts): + if gt_max_overlaps.numpy()[i] >= self.min_pos_iou: + if self.gt_max_assign_all: + equal_x_np = overlaps[i, :].numpy() + equal_y_np = gt_max_overlaps[i].numpy() + max_iou_inds = np.equal(equal_x_np, equal_y_np) + max_iou_inds = paddle.to_tensor(max_iou_inds) + max_iou_inds = paddle.reshape( max_iou_inds, [1,max_iou_inds.shape[0]] ) + match_labels_gts = paddle.full(max_iou_inds.shape, i+1, dtype='int32') + match_labels = paddle.where(max_iou_inds, match_labels_gts, match_labels) + assigned_gt_inds = match_labels + else: + assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1 + + if gt_labels is not None: + # consider multi-class case (AVA) + assert len(gt_labels[0]) > 1 + assigned_labels = paddle.full([num_bboxes, len(gt_labels[0])], 0, dtype='float32') + assigned_gt_inds_reshape = assigned_gt_inds.reshape([assigned_gt_inds.shape[1]]) + pos_inds = paddle.nonzero( assigned_gt_inds_reshape , as_tuple=False) + pos_inds_num = float(paddle.numel(pos_inds)) + if pos_inds_num > 0: + pos_inds = paddle.squeeze(pos_inds, axis = 1 ) + assigned_gt_inds_squeeze = paddle.squeeze(assigned_gt_inds, axis=0) + assigned_gt_inds_select = paddle.index_select(assigned_gt_inds_squeeze, pos_inds) - 1 + gt_labels_select = paddle.index_select(gt_labels, assigned_gt_inds_select) + A = assigned_gt_inds_squeeze + X = assigned_gt_inds_squeeze - 1 + Y = paddle.zeros_like(X) + if A.shape[0]==1: + if float(A) > 0: + T=X + else: + T=Y + else: + T = paddle.where(A>0, X, Y) + S = paddle.index_select(gt_labels, T) + AE = paddle.expand(A, [S.shape[1], A.shape[0]]) + AET = paddle.transpose(AE, perm=[1, 0]) + R = paddle.where(AET>0, S, assigned_labels) + assigned_labels = R + else: + assigned_labels = None + ret = AssignResult( + num_gts, + assigned_gt_inds, + max_overlaps, + labels=assigned_labels) + return ret diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__init__.py new file mode 100644 index 0000000..a88cedc --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__init__.py @@ -0,0 +1,60 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from .actbert import BertForMultiModalPreTraining
+from .adds import ADDS_DepthNet
+from .agcn import AGCN
+from .agcn2s import AGCN2s
+from .asrf import ASRF
+from .bmn import BMN
+from .cfbi import CFBI
+from .ctrgcn import CTRGCN
+from .movinet import MoViNet
+from .ms_tcn import MSTCN
+from .pptsm_mv2 import PPTSM_MobileNetV2
+from .pptsm_mv3 import PPTSM_MobileNetV3
+from .pptsm_v2 import PPTSM_v2
+from .resnet import ResNet
+from .resnet3d_slowonly import ResNet3dSlowOnly
+from .resnet_slowfast import ResNetSlowFast
+from .resnet_slowfast_MRI import ResNetSlowFast_MRI
+from .resnet_tsm import ResNetTSM
+from .resnet_tsm_MRI import ResNetTSM_MRI
+from .resnet_tsn_MRI import ResNetTSN_MRI
+from .resnet_tweaks_tsm import ResNetTweaksTSM
+from .resnet_tweaks_tsn import ResNetTweaksTSN
+from .stgcn import STGCN
+from .swin_transformer import SwinTransformer3D
+from .toshift_vit import TokenShiftVisionTransformer
+from .transnetv2 import TransNetV2
+from .vit import VisionTransformer
+from .vit_tweaks import VisionTransformer_tweaks
+from .yowo import YOWO
+
+__all__ = [
+    'ResNet', 'ResNetTSM', 'ResNetTweaksTSM', 'ResNetSlowFast', 'BMN',
+    'ResNetTweaksTSN', 'VisionTransformer', 'STGCN', 'AGCN', 'TransNetV2',
+    'ADDS_DepthNet', 'VisionTransformer_tweaks', 'BertForMultiModalPreTraining',
+    'ResNetTSN_MRI', 'ResNetTSM_MRI', 'ResNetSlowFast_MRI', 'CFBI', 'MSTCN',
+    'ASRF', 'MoViNet', 'SwinTransformer3D', 'CTRGCN',
+    'TokenShiftVisionTransformer', 'AGCN2s', 'PPTSM_MobileNetV2',
+    'PPTSM_MobileNetV3', 'PPTSM_v2', 'ResNet3dSlowOnly', 'YOWO'
+]
diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000..754b221
Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/__init__.cpython-310.pyc differ
diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/actbert.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/actbert.cpython-310.pyc
new file mode 100644
index 0000000..7d92dc9
Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/actbert.cpython-310.pyc differ
diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/adds.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/adds.cpython-310.pyc
new file mode 100644
index 0000000..24f8479
Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/adds.cpython-310.pyc differ
diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/agcn.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/agcn.cpython-310.pyc
new file mode 100644
index 0000000..dbf6f27
Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/agcn.cpython-310.pyc differ
diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/agcn2s.cpython-310.pyc
b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/agcn2s.cpython-310.pyc new file mode 100644 index 0000000..d2ee340 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/agcn2s.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/asrf.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/asrf.cpython-310.pyc new file mode 100644 index 0000000..3732e07 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/asrf.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/bmn.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/bmn.cpython-310.pyc new file mode 100644 index 0000000..faa0914 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/bmn.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/cfbi.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/cfbi.cpython-310.pyc new file mode 100644 index 0000000..63efba1 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/cfbi.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/ctrgcn.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/ctrgcn.cpython-310.pyc new file mode 100644 index 0000000..86eb240 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/ctrgcn.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/darknet.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/darknet.cpython-310.pyc new file mode 100644 index 0000000..ea3eb95 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/darknet.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/deeplab.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/deeplab.cpython-310.pyc new file mode 100644 index 0000000..6fe7a62 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/deeplab.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/movinet.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/movinet.cpython-310.pyc new file mode 100644 index 0000000..7a42080 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/movinet.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/ms_tcn.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/ms_tcn.cpython-310.pyc new file mode 100644 index 0000000..88c6561 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/ms_tcn.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/pptsm_mv2.cpython-310.pyc 
b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/pptsm_mv2.cpython-310.pyc new file mode 100644 index 0000000..ed5a0fd Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/pptsm_mv2.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/pptsm_mv3.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/pptsm_mv3.cpython-310.pyc new file mode 100644 index 0000000..c701ace Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/pptsm_mv3.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/pptsm_v2.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/pptsm_v2.cpython-310.pyc new file mode 100644 index 0000000..19c44f4 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/pptsm_v2.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet.cpython-310.pyc new file mode 100644 index 0000000..47ddaac Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet3d.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet3d.cpython-310.pyc new file mode 100644 index 0000000..1229e2d Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet3d.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet3d_slowonly.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet3d_slowonly.cpython-310.pyc new file mode 100644 index 0000000..fb791ab Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet3d_slowonly.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_slowfast.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_slowfast.cpython-310.pyc new file mode 100644 index 0000000..bd78377 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_slowfast.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_slowfast_MRI.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_slowfast_MRI.cpython-310.pyc new file mode 100644 index 0000000..7a72c27 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_slowfast_MRI.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tsm.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tsm.cpython-310.pyc new file mode 100644 index 0000000..7708769 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tsm.cpython-310.pyc differ diff --git 
a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tsm_MRI.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tsm_MRI.cpython-310.pyc new file mode 100644 index 0000000..0c86eae Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tsm_MRI.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tsn_MRI.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tsn_MRI.cpython-310.pyc new file mode 100644 index 0000000..4825ba0 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tsn_MRI.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tweaks_tsm.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tweaks_tsm.cpython-310.pyc new file mode 100644 index 0000000..9181cc7 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tweaks_tsm.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tweaks_tsn.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tweaks_tsn.cpython-310.pyc new file mode 100644 index 0000000..86c321f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tweaks_tsn.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnext101.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnext101.cpython-310.pyc new file mode 100644 index 0000000..8d8419a Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnext101.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/stgcn.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/stgcn.cpython-310.pyc new file mode 100644 index 0000000..ec6d4cb Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/stgcn.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/swin_transformer.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/swin_transformer.cpython-310.pyc new file mode 100644 index 0000000..c45b075 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/swin_transformer.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/toshift_vit.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/toshift_vit.cpython-310.pyc new file mode 100644 index 0000000..fbe323a Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/toshift_vit.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/transnetv2.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/transnetv2.cpython-310.pyc new file mode 100644 index 0000000..eba1461 Binary files /dev/null 
and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/transnetv2.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/vit.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/vit.cpython-310.pyc new file mode 100644 index 0000000..2421c7e Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/vit.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/vit_tweaks.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/vit_tweaks.cpython-310.pyc new file mode 100644 index 0000000..2bd75fc Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/vit_tweaks.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/yowo.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/yowo.cpython-310.pyc new file mode 100644 index 0000000..6210026 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/yowo.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/actbert.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/actbert.py new file mode 100644 index 0000000..dbee1fd --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/actbert.py @@ -0,0 +1,1158 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import numpy as np +import math +import copy + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout) +from paddle.nn.initializer import Constant, Normal +from ...utils.save_load import load_ckpt +from ..registry import BACKBONES +from ..weight_init import weight_init_ + +ACT2FN = {"gelu": F.gelu, "relu": F.relu, "swish": F.swish} + + +class BertEmbeddings(nn.Layer): + """Construct the embeddings from word, position and token_type embeddings. 
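+    The three embeddings are summed element-wise, then LayerNorm and dropout
+    are applied, following the standard BERT text-side embedding scheme.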
+ """ + def __init__(self, vocab_size, max_position_embeddings, type_vocab_size, + hidden_size, hidden_dropout_prob): + super(BertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(vocab_size, + hidden_size, + padding_idx=0) + self.position_embeddings = nn.Embedding(max_position_embeddings, + hidden_size) + self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size) + + self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12) + self.dropout = nn.Dropout(hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None): + seq_length = input_ids.shape[1] + position_ids = paddle.arange(end=seq_length, dtype="int64") + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = paddle.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) #8,36 -> 8,36,768 + position_embeddings = self.position_embeddings( + position_ids) #8,36 -> 8,36,768 + token_type_embeddings = self.token_type_embeddings( + token_type_ids) #8,36 -> 8,36,768 + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertImageEmbeddings(nn.Layer): + def __init__(self, v_feature_size, v_hidden_size, v_hidden_dropout_prob): + super(BertImageEmbeddings, self).__init__() + self.image_embeddings = nn.Linear(v_feature_size, v_hidden_size) + self.image_location_embeddings = nn.Linear(5, v_hidden_size) + self.LayerNorm = nn.LayerNorm(v_hidden_size, epsilon=1e-12) + self.dropout = nn.Dropout(v_hidden_dropout_prob) + + def forward(self, input_ids, input_loc): + img_embeddings = self.image_embeddings( + input_ids) #8,37,2048 -> 8,37,1024 + loc_embeddings = self.image_location_embeddings( + input_loc) #8,37,5 -> 8,37,1024 + embeddings = self.LayerNorm(img_embeddings + loc_embeddings) + embeddings = self.dropout(embeddings) + return embeddings # shape: bs*seq_len*hs + + +class BertActionEmbeddings(nn.Layer): + def __init__(self, a_feature_size, a_hidden_size, a_hidden_dropout_prob): + super(BertActionEmbeddings, self).__init__() + self.action_embeddings = nn.Linear(a_feature_size, a_hidden_size) + self.LayerNorm = nn.LayerNorm(a_hidden_size, epsilon=1e-12) + self.dropout = nn.Dropout(a_hidden_dropout_prob) + + def forward(self, input_ids): + action_embeddings = self.action_embeddings( + input_ids) #8,5,2048 -> 8,5,768 + embeddings = self.LayerNorm(action_embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Layer): + def __init__(self, hidden_size, num_attention_heads, + attention_probs_dropout_prob): + super(BertSelfAttention, self).__init__() + if hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, num_attention_heads)) + self.num_attention_heads = num_attention_heads + self.attention_head_size = int(hidden_size / num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(hidden_size, self.all_head_size) + self.key = nn.Linear(hidden_size, self.all_head_size) + self.value = nn.Linear(hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [ + self.num_attention_heads, + self.attention_head_size, + ] + x = x.reshape(new_x_shape) + return x.transpose((0, 2, 1, 3)) + + def 
forward(self, hidden_states, attention_mask): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_layer, + key_layer.transpose((0, 1, 3, 2))) + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + context_layer = paddle.matmul(attention_probs, value_layer) + context_layer = context_layer.transpose((0, 2, 1, 3)) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + return context_layer, attention_probs + + +class BertSelfOutput(nn.Layer): + def __init__(self, hidden_size, hidden_dropout_prob): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(hidden_size, hidden_size) + self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12) + self.dropout = nn.Dropout(hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Layer): + def __init__(self, hidden_size, hidden_dropout_prob, num_attention_heads, + attention_probs_dropout_prob): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(hidden_size, num_attention_heads, + attention_probs_dropout_prob) + self.output = BertSelfOutput(hidden_size, hidden_dropout_prob) + + def forward(self, input_tensor, attention_mask): + self_output, attention_probs = self.self(input_tensor, attention_mask) + attention_output = self.output(self_output, input_tensor) + return attention_output, attention_probs + + +class BertIntermediate(nn.Layer): + def __init__(self, hidden_size, intermediate_size, hidden_act): + super(BertIntermediate, self).__init__() + self.dense = nn.Linear(hidden_size, intermediate_size) + if isinstance(hidden_act, str) or (sys.version_info[0] == 2 + and isinstance(hidden_act, str)): + self.intermediate_act_fn = ACT2FN[hidden_act] + else: + self.intermediate_act_fn = hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Layer): + def __init__(self, intermediate_size, hidden_size, hidden_dropout_prob): + super(BertOutput, self).__init__() + self.dense = nn.Linear(intermediate_size, hidden_size) + self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12) + self.dropout = nn.Dropout(hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + 
input_tensor)
+        return hidden_states
+
+
+class BertEntAttention(nn.Layer):
+    """Core module of the tangled transformer.
+    """
+    def __init__(
+        self,
+        hidden_size,
+        v_hidden_size,
+        a_hidden_size,
+        bi_hidden_size,
+        attention_probs_dropout_prob,
+        v_attention_probs_dropout_prob,
+        a_attention_probs_dropout_prob,
+        av_attention_probs_dropout_prob,
+        at_attention_probs_dropout_prob,
+        bi_num_attention_heads,
+    ):
+        super(BertEntAttention, self).__init__()
+        if bi_hidden_size % bi_num_attention_heads != 0:
+            raise ValueError(
+                "The hidden size (%d) is not a multiple of the number of attention "
+                "heads (%d)" % (bi_hidden_size, bi_num_attention_heads))
+
+        self.num_attention_heads = bi_num_attention_heads
+        self.attention_head_size = int(bi_hidden_size / bi_num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        # self attention layers for vision input
+        self.query1 = nn.Linear(v_hidden_size, self.all_head_size)
+        self.key1 = nn.Linear(v_hidden_size, self.all_head_size)
+        self.value1 = nn.Linear(v_hidden_size, self.all_head_size)
+        self.dropout1 = nn.Dropout(v_attention_probs_dropout_prob)
+
+        # self attention layers for text input
+        self.query2 = nn.Linear(hidden_size, self.all_head_size)
+        self.key2 = nn.Linear(hidden_size, self.all_head_size)
+        self.value2 = nn.Linear(hidden_size, self.all_head_size)
+        self.dropout2 = nn.Dropout(attention_probs_dropout_prob)
+
+        # self attention layers for action input
+        self.query3 = nn.Linear(a_hidden_size, self.all_head_size)
+        self.key3 = nn.Linear(a_hidden_size, self.all_head_size)
+        self.value3 = nn.Linear(a_hidden_size, self.all_head_size)
+        self.dropout3 = nn.Dropout(a_attention_probs_dropout_prob)
+
+        # self attention layers for action_text
+        self.key_at = nn.Linear(bi_hidden_size, self.all_head_size)
+        self.value_at = nn.Linear(bi_hidden_size, self.all_head_size)
+        self.dropout_at = nn.Dropout(av_attention_probs_dropout_prob)
+
+        # self attention layers for action_vision
+        self.key_av = nn.Linear(bi_hidden_size, self.all_head_size)
+        self.value_av = nn.Linear(bi_hidden_size, self.all_head_size)
+        self.dropout_av = nn.Dropout(at_attention_probs_dropout_prob)
+
+    def transpose_for_scores(self, x):
+        new_x_shape = x.shape[:-1] + [
+            self.num_attention_heads,
+            self.attention_head_size,
+        ]
+        x = x.reshape(new_x_shape)
+        return x.transpose((0, 2, 1, 3))
+
+    def forward(
+        self,
+        input_tensor1,
+        attention_mask1,
+        input_tensor2,
+        attention_mask2,
+        input_tensor3,
+        attention_mask3,
+    ):
+
+        # for vision input.
+ mixed_query_layer1 = self.query1(input_tensor1) + mixed_key_layer1 = self.key1(input_tensor1) + mixed_value_layer1 = self.value1(input_tensor1) + + query_layer1 = self.transpose_for_scores(mixed_query_layer1) + key_layer1 = self.transpose_for_scores(mixed_key_layer1) + value_layer1 = self.transpose_for_scores(mixed_value_layer1) + + # for text input: + mixed_query_layer2 = self.query2(input_tensor2) + mixed_key_layer2 = self.key2(input_tensor2) + mixed_value_layer2 = self.value2(input_tensor2) + + query_layer2 = self.transpose_for_scores(mixed_query_layer2) + key_layer2 = self.transpose_for_scores(mixed_key_layer2) + value_layer2 = self.transpose_for_scores(mixed_value_layer2) + + # for action input: + mixed_query_layer3 = self.query3(input_tensor3) + mixed_key_layer3 = self.key3(input_tensor3) + mixed_value_layer3 = self.value3(input_tensor3) + + query_layer3 = self.transpose_for_scores(mixed_query_layer3) + key_layer3 = self.transpose_for_scores(mixed_key_layer3) + value_layer3 = self.transpose_for_scores(mixed_value_layer3) + + def do_attention(query_layer, key_layer, value_layer, attention_mask, + dropout): + """ compute attention """ + attention_scores = paddle.matmul(query_layer, + key_layer.transpose((0, 1, 3, 2))) + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = dropout(attention_probs) + + context_layer = paddle.matmul(attention_probs, value_layer) + context_layer = context_layer.transpose((0, 2, 1, 3)) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size + ] + context_layer = context_layer.reshape(new_context_layer_shape) + return context_layer + + context_av = do_attention(query_layer3, key_layer1, value_layer1, + attention_mask1, self.dropout_av) + context_at = do_attention(query_layer3, key_layer2, value_layer2, + attention_mask2, self.dropout_at) + + context_key_av = self.key_av(context_av).transpose((0, 2, 1)) + # interpolate only support 4-D tensor now. 
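+        # The cross-modal context vectors computed from the action queries are
+        # resized along the sequence axis (via interpolate) to the other
+        # modality's length and added to that modality's keys/values as a
+        # residual, so text and vision attention are conditioned on the action
+        # stream.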
+ context_key_av = F.interpolate(context_key_av.unsqueeze(-1), + size=(key_layer2.shape[2], + 1)).squeeze(-1) + context_key_av = self.transpose_for_scores( + context_key_av.transpose((0, 2, 1))) + key_layer2 = key_layer2 + context_key_av + + context_key_at = self.key_at(context_at).transpose((0, 2, 1)) + context_key_at = F.interpolate(context_key_at.unsqueeze(-1), + size=(key_layer1.shape[2], + 1)).squeeze(-1) + context_key_at = self.transpose_for_scores( + context_key_at.transpose((0, 2, 1))) + key_layer1 = key_layer1 + context_key_at + + context_val_av = self.value_at(context_av).transpose((0, 2, 1)) + context_val_av = F.interpolate(context_val_av.unsqueeze(-1), + size=(value_layer2.shape[2], + 1)).squeeze(-1) + context_val_av = self.transpose_for_scores( + context_val_av.transpose((0, 2, 1))) + value_layer2 = value_layer2 + context_val_av + + context_val_at = self.value_at(context_at).transpose((0, 2, 1)) + context_val_at = F.interpolate(context_val_at.unsqueeze(-1), + size=(value_layer1.shape[2], + 1)).squeeze(-1) + context_val_at = self.transpose_for_scores( + context_val_at.transpose((0, 2, 1))) + value_layer1 = value_layer1 + context_val_at + + context_layer1 = do_attention(query_layer1, key_layer1, value_layer1, + attention_mask1, self.dropout1) + context_layer2 = do_attention(query_layer2, key_layer2, value_layer2, + attention_mask2, self.dropout2) + context_layer3 = do_attention(query_layer3, key_layer3, value_layer3, + attention_mask3, self.dropout3) + + return context_layer1, context_layer2, context_layer3 # vision, text, action + + +class BertEntOutput(nn.Layer): + def __init__( + self, + bi_hidden_size, + hidden_size, + v_hidden_size, + v_hidden_dropout_prob, + hidden_dropout_prob, + ): + super(BertEntOutput, self).__init__() + + self.dense1 = nn.Linear(bi_hidden_size, v_hidden_size) + self.LayerNorm1 = nn.LayerNorm(v_hidden_size, epsilon=1e-12) + self.dropout1 = nn.Dropout(v_hidden_dropout_prob) + + self.dense2 = nn.Linear(bi_hidden_size, hidden_size) + self.LayerNorm2 = nn.LayerNorm(hidden_size, epsilon=1e-12) + self.dropout2 = nn.Dropout(hidden_dropout_prob) + + self.dense3 = nn.Linear(bi_hidden_size, hidden_size) + self.LayerNorm3 = nn.LayerNorm(hidden_size, epsilon=1e-12) + self.dropout3 = nn.Dropout(hidden_dropout_prob) + + def forward( + self, + hidden_states1, + input_tensor1, + hidden_states2, + input_tensor2, + hidden_states3, + input_tensor3, + ): + context_state1 = self.dense1(hidden_states1) + context_state1 = self.dropout1(context_state1) + + context_state2 = self.dense2(hidden_states2) + context_state2 = self.dropout2(context_state2) + + context_state3 = self.dense3(hidden_states3) + context_state3 = self.dropout3(context_state3) + + hidden_states1 = self.LayerNorm1(context_state1 + input_tensor1) + hidden_states2 = self.LayerNorm2(context_state2 + input_tensor2) + hidden_states3 = self.LayerNorm3(context_state3 + input_tensor3) + + return hidden_states1, hidden_states2, hidden_states3 + + +class BertLayer(nn.Layer): + def __init__(self, hidden_size, intermediate_size, hidden_act, + hidden_dropout_prob, num_attention_heads, + attention_probs_dropout_prob): + super(BertLayer, self).__init__() + self.attention = BertAttention(hidden_size, hidden_dropout_prob, + num_attention_heads, + attention_probs_dropout_prob) + self.intermediate = BertIntermediate(hidden_size, intermediate_size, + hidden_act) + self.output = BertOutput(intermediate_size, hidden_size, + hidden_dropout_prob) + + def forward(self, hidden_states, attention_mask): + attention_output, 
attention_probs = self.attention( + hidden_states, attention_mask) + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output, attention_probs + + +class BertConnectionLayer(nn.Layer): + def __init__(self, hidden_size, v_hidden_size, a_hidden_size, + bi_hidden_size, bi_num_attention_heads, + attention_probs_dropout_prob, v_attention_probs_dropout_prob, + a_attention_probs_dropout_prob, + av_attention_probs_dropout_prob, + at_attention_probs_dropout_prob, intermediate_size, + v_intermediate_size, a_intermediate_size, hidden_act, + v_hidden_act, a_hidden_act, hidden_dropout_prob, + v_hidden_dropout_prob, a_hidden_dropout_prob): + super(BertConnectionLayer, self).__init__() + self.ent_attention = BertEntAttention( + hidden_size, + v_hidden_size, + a_hidden_size, + bi_hidden_size, + attention_probs_dropout_prob, + v_attention_probs_dropout_prob, + a_attention_probs_dropout_prob, + av_attention_probs_dropout_prob, + at_attention_probs_dropout_prob, + bi_num_attention_heads, + ) + + self.ent_output = BertEntOutput( + bi_hidden_size, + hidden_size, + v_hidden_size, + v_hidden_dropout_prob, + hidden_dropout_prob, + ) + + self.v_intermediate = BertIntermediate(v_hidden_size, + v_intermediate_size, + v_hidden_act) + self.v_output = BertOutput(v_intermediate_size, v_hidden_size, + v_hidden_dropout_prob) + + self.t_intermediate = BertIntermediate(hidden_size, intermediate_size, + hidden_act) + self.t_output = BertOutput(intermediate_size, hidden_size, + hidden_dropout_prob) + + self.a_intermediate = BertIntermediate(a_hidden_size, + a_intermediate_size, + a_hidden_act) + self.a_output = BertOutput(a_intermediate_size, a_hidden_size, + a_hidden_dropout_prob) + + def forward( + self, + input_tensor1, + attention_mask1, + input_tensor2, + attention_mask2, + input_tensor3, + attention_mask3, + ): + + ent_output1, ent_output2, ent_output3 = self.ent_attention( + input_tensor1, attention_mask1, input_tensor2, attention_mask2, + input_tensor3, attention_mask3) + + attention_output1, attention_output2, attention_output3 = self.ent_output( + ent_output1, input_tensor1, ent_output2, input_tensor2, ent_output3, + input_tensor3) + + intermediate_output1 = self.v_intermediate(attention_output1) + layer_output1 = self.v_output(intermediate_output1, attention_output1) + + intermediate_output2 = self.t_intermediate(attention_output2) + layer_output2 = self.t_output(intermediate_output2, attention_output2) + + intermediate_output3 = self.a_intermediate(attention_output3) + layer_output3 = self.a_output(intermediate_output3, attention_output3) + + return layer_output1, layer_output2, layer_output3 + + +class BertEncoder(nn.Layer): + """ + ActBert Encoder, consists 3 pathway of multi-BertLayers and BertConnectionLayer. 
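+    Each pathway (text, vision, action) stacks its own BertLayers; at the layer
+    indices given by *_ent_attention_id a BertConnectionLayer exchanges
+    information across the three pathways.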
+    """
+    def __init__(
+        self,
+        v_ent_attention_id,
+        t_ent_attention_id,
+        a_ent_attention_id,
+        fixed_t_layer,
+        fixed_v_layer,
+        hidden_size,
+        v_hidden_size,
+        a_hidden_size,
+        bi_hidden_size,
+        intermediate_size,
+        v_intermediate_size,
+        a_intermediate_size,
+        hidden_act,
+        v_hidden_act,
+        a_hidden_act,
+        hidden_dropout_prob,
+        v_hidden_dropout_prob,
+        a_hidden_dropout_prob,
+        attention_probs_dropout_prob,
+        v_attention_probs_dropout_prob,
+        a_attention_probs_dropout_prob,
+        av_attention_probs_dropout_prob,
+        at_attention_probs_dropout_prob,
+        num_attention_heads,
+        v_num_attention_heads,
+        a_num_attention_heads,
+        bi_num_attention_heads,
+        num_hidden_layers,
+        v_num_hidden_layers,
+        a_num_hidden_layers,
+    ):
+        super(BertEncoder, self).__init__()
+        self.v_ent_attention_id = v_ent_attention_id
+        self.t_ent_attention_id = t_ent_attention_id
+        self.a_ent_attention_id = a_ent_attention_id
+        self.fixed_t_layer = fixed_t_layer
+        self.fixed_v_layer = fixed_v_layer
+
+        layer = BertLayer(hidden_size, intermediate_size, hidden_act,
+                          hidden_dropout_prob, num_attention_heads,
+                          attention_probs_dropout_prob)
+        v_layer = BertLayer(v_hidden_size, v_intermediate_size, v_hidden_act,
+                            v_hidden_dropout_prob, v_num_attention_heads,
+                            v_attention_probs_dropout_prob)
+        a_layer = BertLayer(a_hidden_size, a_intermediate_size, a_hidden_act,
+                            a_hidden_dropout_prob, a_num_attention_heads,
+                            a_attention_probs_dropout_prob)
+        connect_layer = BertConnectionLayer(
+            hidden_size, v_hidden_size, a_hidden_size, bi_hidden_size,
+            bi_num_attention_heads, attention_probs_dropout_prob,
+            v_attention_probs_dropout_prob, a_attention_probs_dropout_prob,
+            av_attention_probs_dropout_prob, at_attention_probs_dropout_prob,
+            intermediate_size, v_intermediate_size, a_intermediate_size,
+            hidden_act, v_hidden_act, a_hidden_act, hidden_dropout_prob,
+            v_hidden_dropout_prob, a_hidden_dropout_prob)
+
+        self.layer = nn.LayerList(
+            [copy.deepcopy(layer) for _ in range(num_hidden_layers)])  #12
+        self.v_layer = nn.LayerList(
+            [copy.deepcopy(v_layer) for _ in range(v_num_hidden_layers)])  #2
+        self.a_layer = nn.LayerList(
+            [copy.deepcopy(a_layer) for _ in range(a_num_hidden_layers)])  #3
+        self.c_layer = nn.LayerList([
+            copy.deepcopy(connect_layer) for _ in range(len(v_ent_attention_id))
+        ]  #2 [0,1]
+        )
+
+    def forward(
+        self,
+        txt_embedding,
+        image_embedding,
+        action_embedding,
+        txt_attention_mask,
+        image_attention_mask,
+        action_attention_mask,
+        output_all_encoded_layers=True,
+    ):
+        v_start, a_start, t_start = 0, 0, 0
+        count = 0
+        all_encoder_layers_t = []
+        all_encoder_layers_v = []
+        all_encoder_layers_a = []
+
+        for v_layer_id, a_layer_id, t_layer_id in zip(self.v_ent_attention_id,
+                                                      self.a_ent_attention_id,
+                                                      self.t_ent_attention_id):
+            v_end = v_layer_id
+            a_end = a_layer_id
+            t_end = t_layer_id
+
+            assert self.fixed_t_layer <= t_end
+            assert self.fixed_v_layer <= v_end
+
+            ### region embedding
+            for idx in range(v_start,
+                             self.fixed_v_layer):  # never entered in either training run; the first fixed_v_layer layers stay frozen
+                with paddle.no_grad():
+                    image_embedding, image_attention_probs = self.v_layer[idx](
+                        image_embedding, image_attention_mask)
+                v_start = self.fixed_v_layer
+            for idx in range(v_start, v_end):
+                image_embedding, image_attention_probs = self.v_layer[idx](
+                    image_embedding, image_attention_mask)
+
+            ### action embedding
+            for idx in range(a_start, a_end):
+                action_embedding, action_attention_probs = self.a_layer[idx](
+                    action_embedding, action_attention_mask)
+
+            ### text embedding
+            for idx in range(t_start, self.fixed_t_layer):
+                with
paddle.no_grad(): + txt_embedding, txt_attention_probs = self.layer[idx]( + txt_embedding, txt_attention_mask) + t_start = self.fixed_t_layer + for idx in range(t_start, t_end): + txt_embedding, txt_attention_probs = self.layer[idx]( + txt_embedding, txt_attention_mask) + + image_embedding, txt_embedding, action_embedding = self.c_layer[ + count](image_embedding, image_attention_mask, txt_embedding, + txt_attention_mask, action_embedding, + action_attention_mask) + + v_start = v_end + t_start = t_end + a_start = a_end + count += 1 + + if output_all_encoded_layers: + all_encoder_layers_t.append(txt_embedding) + all_encoder_layers_v.append(image_embedding) + all_encoder_layers_a.append(action_embedding) + + for idx in range(v_start, len(self.v_layer)): # 1 + image_embedding, image_attention_probs = self.v_layer[idx]( + image_embedding, image_attention_mask) + + for idx in range(a_start, len(self.a_layer)): + action_embedding, action_attention_probs = self.a_layer[idx]( + action_embedding, action_attention_mask) + + for idx in range(t_start, len(self.layer)): + txt_embedding, txt_attention_probs = self.layer[idx]( + txt_embedding, txt_attention_mask) + + # add the end part to finish. + if not output_all_encoded_layers: + all_encoder_layers_t.append(txt_embedding) #8, 36, 768 + all_encoder_layers_v.append(image_embedding) #8, 37, 1024 + all_encoder_layers_a.append(action_embedding) #8, 5, 768 + + return all_encoder_layers_t, all_encoder_layers_v, all_encoder_layers_a + + +class BertPooler(nn.Layer): + """ "Pool" the model by simply taking the hidden state corresponding + to the first token. + """ + def __init__(self, hidden_size, bi_hidden_size): + super(BertPooler, self).__init__() + self.dense = nn.Linear(hidden_size, bi_hidden_size) + self.activation = nn.ReLU() + + def forward(self, hidden_states): + first_token_tensor = hidden_states[:, 0] #8, 768 + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertModel(nn.Layer): + def __init__( + self, + vocab_size, + max_position_embeddings, + type_vocab_size, + v_feature_size, + a_feature_size, + num_hidden_layers, + v_num_hidden_layers, + a_num_hidden_layers, + v_ent_attention_id, + t_ent_attention_id, + a_ent_attention_id, + fixed_t_layer, + fixed_v_layer, + hidden_size, + v_hidden_size, + a_hidden_size, + bi_hidden_size, + intermediate_size, + v_intermediate_size, + a_intermediate_size, + hidden_act, + v_hidden_act, + a_hidden_act, + hidden_dropout_prob, + v_hidden_dropout_prob, + a_hidden_dropout_prob, + attention_probs_dropout_prob, + v_attention_probs_dropout_prob, + a_attention_probs_dropout_prob, + av_attention_probs_dropout_prob, + at_attention_probs_dropout_prob, + num_attention_heads, + v_num_attention_heads, + a_num_attention_heads, + bi_num_attention_heads, + ): + super(BertModel, self).__init__() + # initilize word embedding + self.embeddings = BertEmbeddings(vocab_size, max_position_embeddings, + type_vocab_size, hidden_size, + hidden_dropout_prob) + # initlize the region embedding + self.v_embeddings = BertImageEmbeddings(v_feature_size, v_hidden_size, + v_hidden_dropout_prob) + # initlize the action embedding + self.a_embeddings = BertActionEmbeddings(a_feature_size, a_hidden_size, + a_hidden_dropout_prob) + + self.encoder = BertEncoder( + v_ent_attention_id, t_ent_attention_id, a_ent_attention_id, + fixed_t_layer, fixed_v_layer, hidden_size, v_hidden_size, + a_hidden_size, bi_hidden_size, intermediate_size, + v_intermediate_size, a_intermediate_size, 
hidden_act, v_hidden_act,
+            a_hidden_act, hidden_dropout_prob, v_hidden_dropout_prob,
+            a_hidden_dropout_prob, attention_probs_dropout_prob,
+            v_attention_probs_dropout_prob, a_attention_probs_dropout_prob,
+            av_attention_probs_dropout_prob, at_attention_probs_dropout_prob,
+            num_attention_heads, v_num_attention_heads, a_num_attention_heads,
+            bi_num_attention_heads, num_hidden_layers, v_num_hidden_layers,
+            a_num_hidden_layers)
+
+        self.t_pooler = BertPooler(hidden_size, bi_hidden_size)
+        self.v_pooler = BertPooler(v_hidden_size, bi_hidden_size)
+        self.a_pooler = BertPooler(a_hidden_size, bi_hidden_size)
+
+    def forward(
+        self,
+        text_ids,
+        action_feat,
+        image_feat,
+        image_loc,
+        token_type_ids=None,
+        text_mask=None,
+        image_mask=None,
+        action_mask=None,
+        output_all_encoded_layers=False,
+    ):
+        """
+        text_ids: input text ids. Shape: [batch_size, sequence_length]
+        action_feat: input action feature. Shape: [batch_size, action_length, action_feature_dim]
+        image_feat: input image feature. Shape: [batch_size, region_length, image_feature_dim]
+        image_loc: input region location. Shape: [batch_size, region_length, region_location_dim]
+        token_type_ids: segment ids of each video clip. Shape: [batch_size, sequence_length]
+        text_mask: text mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, sequence_length]
+        image_mask: image mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, region_length]
+        action_mask: action mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, action_length]
+        output_all_encoded_layers: whether to return features from all encoder layers. Type: Bool.
+        """
+        if text_mask is None:
+            text_mask = paddle.ones_like(text_ids)
+        if token_type_ids is None:
+            token_type_ids = paddle.zeros_like(text_ids)
+        if image_mask is None:
+            image_mask = paddle.ones(
+                [image_feat.shape[0], image_feat.shape[1]]).astype(text_ids.dtype)
+        if action_mask is None:
+            action_mask = paddle.ones(
+                [action_feat.shape[0], action_feat.shape[1]]).astype(text_ids.dtype)
+
+        # We create a 3D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length].
+        extended_text_mask = text_mask.unsqueeze(1).unsqueeze(2)
+        extended_image_mask = image_mask.unsqueeze(1).unsqueeze(2)
+        extended_action_mask = action_mask.unsqueeze(1).unsqueeze(2)
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
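+        # For example, a mask [1, 1, 0] becomes [0.0, 0.0, -10000.0]: attended
+        # positions are unchanged, while masked positions get effectively zero
+        # probability after the softmax.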
+ def set_mask(extended_attention_mask): + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + extended_text_mask = set_mask(extended_text_mask) + extended_image_mask = set_mask(extended_image_mask) + extended_action_mask = set_mask(extended_action_mask) + + t_embedding_output = self.embeddings(text_ids, token_type_ids) + v_embedding_output = self.v_embeddings(image_feat, image_loc) + a_embedding_output = self.a_embeddings(action_feat) + + # var = [t_embedding_output, v_embedding_output, a_embedding_output] + # import numpy as np + # for i, item in enumerate(var): + # np.save('tmp/' + str(i)+'.npy', item.numpy()) + + encoded_layers_t, encoded_layers_v, encoded_layers_a = self.encoder( + t_embedding_output, + v_embedding_output, + a_embedding_output, + extended_text_mask, + extended_image_mask, + extended_action_mask, + output_all_encoded_layers=output_all_encoded_layers, + ) + + sequence_output_t = encoded_layers_t[-1] #get item from list + sequence_output_v = encoded_layers_v[-1] + sequence_output_a = encoded_layers_a[-1] + + pooled_output_t = self.t_pooler(sequence_output_t) + pooled_output_v = self.v_pooler(sequence_output_v) + pooled_output_a = self.a_pooler(sequence_output_a) + + if not output_all_encoded_layers: + encoded_layers_t = encoded_layers_t[-1] + encoded_layers_v = encoded_layers_v[-1] + encoded_layers_a = encoded_layers_a[-1] + + return encoded_layers_t, encoded_layers_v, encoded_layers_a, \ + pooled_output_t, pooled_output_v, pooled_output_a + + +# For Head +class BertPredictionHeadTransform(nn.Layer): + def __init__(self, hidden_size, hidden_act): + super(BertPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(hidden_size, hidden_size) + if isinstance(hidden_act, str) or (sys.version_info[0] == 2 + and isinstance(hidden_act, str)): + self.transform_act_fn = ACT2FN[hidden_act] + else: + self.transform_act_fn = hidden_act + self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Layer): + def __init__(self, hidden_size, hidden_act, bert_model_embedding_weights): + super(BertLMPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(hidden_size, hidden_act) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
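+        # The decoder weight is tied to the word-embedding matrix (used
+        # transposed in forward); only the per-token bias created below is a
+        # new parameter.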
+ assert bert_model_embedding_weights.shape[1] == hidden_size + vocab_size = bert_model_embedding_weights.shape[0] + + # another implementation which would create another big params: + # self.decoder = nn.Linear(hidden_size, vocab_size) # NOTE bias default: constant 0.0 + # self.decoder.weight = self.create_parameter(shape=[hidden_size, vocab_size], + # default_initializer=nn.initializer.Assign( + # bert_model_embedding_weights.t())) # transpose + + self.decoder_weight = bert_model_embedding_weights + self.decoder_bias = self.create_parameter( + shape=[vocab_size], + dtype=bert_model_embedding_weights.dtype, + is_bias=True) # NOTE bias default: constant 0.0 + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = paddle.tensor.matmul( + hidden_states, self.decoder_weight, + transpose_y=True) + self.decoder_bias + return hidden_states + + +class BertImageActionPredictionHead(nn.Layer): + def __init__(self, hidden_size, hidden_act, target_size): + super(BertImageActionPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(hidden_size, hidden_act) + + self.decoder = nn.Linear(hidden_size, target_size) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertPreTrainingHeads(nn.Layer): + def __init__(self, hidden_size, v_hidden_size, a_hidden_size, + bi_hidden_size, hidden_act, v_hidden_act, a_hidden_act, + v_target_size, a_target_size, fusion_method, + bert_model_embedding_weights): + super(BertPreTrainingHeads, self).__init__() + self.predictions = BertLMPredictionHead(hidden_size, hidden_act, + bert_model_embedding_weights) + self.seq_relationship = nn.Linear(bi_hidden_size, 2) + self.imagePredictions = BertImageActionPredictionHead( + v_hidden_size, v_hidden_act, v_target_size) # visual class number + self.actionPredictions = BertImageActionPredictionHead( + a_hidden_size, a_hidden_act, a_target_size) # action class number + self.fusion_method = fusion_method + self.dropout = nn.Dropout(0.1) + + def forward(self, sequence_output_t, sequence_output_v, sequence_output_a, + pooled_output_t, pooled_output_v, pooled_output_a): + + if self.fusion_method == 'sum': + pooled_output = self.dropout(pooled_output_t + pooled_output_v + + pooled_output_a) + elif self.fusion_method == 'mul': + pooled_output = self.dropout(pooled_output_t * pooled_output_v + + pooled_output_a) + else: + assert False + + prediction_scores_t = self.predictions( + sequence_output_t) # 8, 36 ,30522 + seq_relationship_score = self.seq_relationship(pooled_output) # 8, 2 + prediction_scores_v = self.imagePredictions( + sequence_output_v) # 8, 37, 1601 + prediction_scores_a = self.actionPredictions( + sequence_output_a) # 8, 5, 401 + + return prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score + + +@BACKBONES.register() +class BertForMultiModalPreTraining(nn.Layer): + """BERT model with multi modal pre-training heads. 
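+    Combines the three-stream BertModel with BertPreTrainingHeads: a masked
+    language-model head on text, classification heads on the visual and action
+    streams, and a relationship head on the fused pooled output.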
+ """ + def __init__( + self, + vocab_size=30522, + max_position_embeddings=512, + type_vocab_size=2, + v_target_size=1601, + a_target_size=700, + v_feature_size=2048, + a_feature_size=2048, + num_hidden_layers=12, + v_num_hidden_layers=2, + a_num_hidden_layers=3, + t_ent_attention_id=[10, 11], + v_ent_attention_id=[0, 1], + a_ent_attention_id=[0, 1], + fixed_t_layer=0, + fixed_v_layer=0, + hidden_size=768, + v_hidden_size=1024, + a_hidden_size=768, + bi_hidden_size=1024, + intermediate_size=3072, + v_intermediate_size=1024, + a_intermediate_size=3072, + hidden_act="gelu", + v_hidden_act="gelu", + a_hidden_act="gelu", + hidden_dropout_prob=0.1, + v_hidden_dropout_prob=0.1, + a_hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + v_attention_probs_dropout_prob=0.1, + a_attention_probs_dropout_prob=0.1, + av_attention_probs_dropout_prob=0.1, + at_attention_probs_dropout_prob=0.1, + num_attention_heads=12, + v_num_attention_heads=8, + a_num_attention_heads=12, + bi_num_attention_heads=8, + fusion_method="mul", + pretrained=None, + ): + """ + vocab_size: vocabulary size. Default: 30522. + max_position_embeddings: max position id. Default: 512. + type_vocab_size: max segment id. Default: 2. + v_target_size: class number of visual word. Default: 1601. + a_target_size: class number of action word. Default: 700. + v_feature_size: input visual feature dimension. Default: 2048. + a_feature_size: input action feature dimension. Default: 2048. + num_hidden_layers: number of BertLayer in text transformer. Default: 12. + v_num_hidden_layers: number of BertLayer in visual transformer. Default: 2. + a_num_hidden_layers: number of BertLayer in action transformer. Default:3. + t_ent_attention_id: index id of BertConnectionLayer in text transformer. Default: [10, 11]. + v_ent_attention_id: index id of BertConnectionLayer in visual transformer. Default:[0, 1]. + a_ent_attention_id: index id of BertConnectionLayer in action transformer. Default:[0, 1]. + fixed_t_layer: index id of fixed BertLayer in text transformer. Default: 0. + fixed_v_layer: index id of fixed BertLayer in visual transformer. Default: 0. + hidden_size: hidden size in text BertLayer. Default: 768. + v_hidden_size: hidden size in visual BertLayer. Default: 1024. + a_hidden_size: hidden size in action BertLayer. Default: 768. + bi_hidden_size: hidden size in BertConnectionLayer. Default: 1024, + intermediate_size: intermediate size in text BertLayer. Default: 3072. + v_intermediate_size: intermediate size in visual BertLayer. Default: 1024. + a_intermediate_size: intermediate size in text BertLayer. Default: 3072. + hidden_act: hidden activation function in text BertLayer. Default: "gelu". + v_hidden_act: hidden activation function in visual BertLayer. Default: "gelu". + a_hidden_act: hidden activation function in action BertLayer. Default: "gelu". + hidden_dropout_prob: hidden dropout probability in text Embedding Layer. Default: 0.1 + v_hidden_dropout_prob: hidden dropout probability in visual Embedding Layer. Default: 0.1 + a_hidden_dropout_prob: hidden dropout probability in action Embedding Layer. Default: 0.1 + attention_probs_dropout_prob: attention dropout probability in text BertLayer. Default: 0.1 + v_attention_probs_dropout_prob: attention dropout probability in visual BertLayer. Default: 0.1 + a_attention_probs_dropout_prob: attention dropout probability in action BertLayer. Default: 0.1 + av_attention_probs_dropout_prob: attention dropout probability in action-visual BertConnectionLayer. 
Default: 0.1 + at_attention_probs_dropout_prob: attention dropout probability in action-text BertConnectionLayer. Default: 0.1 + num_attention_heads: number of heads in text BertLayer. Default: 12. + v_num_attention_heads: number of heads in visual BertLayer. Default: 8. + a_num_attention_heads: number of heads in action BertLayer. Default: 12. + bi_num_attention_heads: number of heads in BertConnectionLayer. Default: 8. + fusion_method: methods of fusing pooled output from 3 transformer. Default: "mul". + """ + super(BertForMultiModalPreTraining, self).__init__() + self.pretrained = pretrained + self.vocab_size = vocab_size + self.a_target_size = a_target_size + + self.bert = BertModel( + vocab_size, + max_position_embeddings, + type_vocab_size, + v_feature_size, + a_feature_size, + num_hidden_layers, + v_num_hidden_layers, + a_num_hidden_layers, + v_ent_attention_id, + t_ent_attention_id, + a_ent_attention_id, + fixed_t_layer, + fixed_v_layer, + hidden_size, + v_hidden_size, + a_hidden_size, + bi_hidden_size, + intermediate_size, + v_intermediate_size, + a_intermediate_size, + hidden_act, + v_hidden_act, + a_hidden_act, + hidden_dropout_prob, + v_hidden_dropout_prob, + a_hidden_dropout_prob, + attention_probs_dropout_prob, + v_attention_probs_dropout_prob, + a_attention_probs_dropout_prob, + av_attention_probs_dropout_prob, + at_attention_probs_dropout_prob, + num_attention_heads, + v_num_attention_heads, + a_num_attention_heads, + bi_num_attention_heads, + ) + self.cls = BertPreTrainingHeads( + hidden_size, v_hidden_size, a_hidden_size, bi_hidden_size, + hidden_act, v_hidden_act, a_hidden_act, v_target_size, + a_target_size, fusion_method, + self.bert.embeddings.word_embeddings.weight) + + def init_weights(self): + """Initiate the parameters. + """ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, (nn.Linear, nn.Embedding)): + weight_init_(layer, 'Normal', std=0.02) + elif isinstance(layer, nn.LayerNorm): + weight_init_(layer, 'Constant', value=1) + + def forward( + self, + text_ids, #8,36 + action_feat, #8,5,2048 + image_feat, #8,37,2048 + image_loc, #8,37,5 + token_type_ids=None, #8,36 + text_mask=None, #8,36 + image_mask=None, #8,37 + action_mask=None, #8,5 + ): + """ + text_ids: input text ids. Shape: [batch_size, seqence_length] + action_feat: input action feature. Shape: [batch_size, action_length, action_feature_dim] + image_feat: input image feature. Shape: [batch_size, region_length+1, image_feature_dim]], add 1 for image global feature. + image_loc: input region location. Shape: [batch_size, region_length+1, region_location_dim], add 1 for image global feature location. + token_type_ids: segment ids of each video clip. Shape: [batch_size, seqence_length] + text_mask: text mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, seqence_length] + image_mask: image mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, region_length] + action_mask: action mask, 1 for real tokens and 0 for padding tokens. 
Shape: [batch_size, action_length] + """ + sequence_output_t, sequence_output_v, sequence_output_a, \ + pooled_output_t, pooled_output_v, pooled_output_a = self.bert( + text_ids, + action_feat, + image_feat, + image_loc, + token_type_ids, + text_mask, + image_mask, + action_mask, + output_all_encoded_layers=False, + ) + + prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score = self.cls( + sequence_output_t, sequence_output_v, sequence_output_a, + pooled_output_t, pooled_output_v, pooled_output_a) + + return prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/adds.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/adds.py new file mode 100644 index 0000000..21cd212 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/adds.py @@ -0,0 +1,1146 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from collections import OrderedDict + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import BatchNorm2D, Conv2D +from paddle.nn.initializer import Constant, Normal +from paddle.vision.models import ResNet + +from ...utils import load_ckpt +from ..registry import BACKBONES +from ..weight_init import kaiming_normal_, _calculate_fan_in_and_fan_out + +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) +normal_ = Normal(mean=0, std=1e-3) + + +def disp_to_depth(disp, min_depth, max_depth): + """Convert network's sigmoid output into depth prediction + The formula for this conversion is given in the 'additional considerations' + section of the paper. 
+ """ + min_disp = 1 / max_depth + max_disp = 1 / min_depth + scaled_disp = min_disp + (max_disp - min_disp) * disp + depth = 1 / scaled_disp + return scaled_disp, depth + + +def gram_matrix(y): + (b, ch, h, w) = y.shape + features = y.reshape([b, ch, w * h]) + features_t = paddle.transpose(features, [0, 2, 1]) + gram = features.bmm(features_t) / (ch * h * w) + return gram + + +def convt_bn_relu(in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + bn=True, + relu=True): + bias = not bn + layers = [] + layers.append( + nn.Conv2DTranspose(in_channels, + out_channels, + kernel_size, + stride, + padding, + output_padding, + bias_attr=bias)) + if bn: + layers.append(nn.BatchNorm2D(out_channels)) + + if relu: + layers.append(nn.LeakyReLU(0.2)) + layers = nn.Sequential(*layers) + + # initialize the weights + for m in layers.sublayers(include_self=True): + if isinstance(m, nn.Conv2DTranspose): + normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2D): + ones_(m.weight) + zeros_(m.bias) + return layers + + +def transformation_from_parameters(axisangle, translation, invert=False): + """Convert the network's (axisangle, translation) output into a 4x4 matrix + """ + R = rot_from_axisangle(axisangle) + t = translation.clone() + + if invert: + R = R.transpose([0, 2, 1]) + t *= -1 + + T = get_translation_matrix(t) + + if invert: + M = paddle.matmul(R, T) + else: + M = paddle.matmul(T, R) + + return M + + +def get_translation_matrix(translation_vector): + """Convert a translation vector into a 4x4 transformation matrix + """ + t = translation_vector.reshape([-1, 3, 1]) + gather_object = paddle.stack([ + paddle.zeros([ + translation_vector.shape[0], + ], paddle.float32), + paddle.ones([ + translation_vector.shape[0], + ], paddle.float32), + paddle.squeeze(t[:, 0], axis=-1), + paddle.squeeze(t[:, 1], axis=-1), + paddle.squeeze(t[:, 2], axis=-1), + ]) + gather_index = paddle.to_tensor([ + [1], + [0], + [0], + [2], + [0], + [1], + [0], + [3], + [0], + [0], + [1], + [4], + [0], + [0], + [0], + [1], + ]) + T = paddle.gather_nd(gather_object, gather_index) + T = T.reshape([4, 4, -1]).transpose((2, 0, 1)) + return T + + +def rot_from_axisangle(vec): + """Convert an axisangle rotation into a 4x4 transformation matrix + (adapted from https://github.com/Wallacoloo/printipi) + Input 'vec' has to be Bx1x3 + """ + angle = paddle.norm(vec, 2, 2, True) + axis = vec / (angle + 1e-7) + + ca = paddle.cos(angle) + sa = paddle.sin(angle) + C = 1 - ca + + x = axis[..., 0].unsqueeze(1) + y = axis[..., 1].unsqueeze(1) + z = axis[..., 2].unsqueeze(1) + + xs = x * sa + ys = y * sa + zs = z * sa + xC = x * C + yC = y * C + zC = z * C + xyC = x * yC + yzC = y * zC + zxC = z * xC + + gather_object = paddle.stack([ + paddle.squeeze(x * xC + ca, axis=(-1, -2)), + paddle.squeeze(xyC - zs, axis=(-1, -2)), + paddle.squeeze(zxC + ys, axis=(-1, -2)), + paddle.squeeze(xyC + zs, axis=(-1, -2)), + paddle.squeeze(y * yC + ca, axis=(-1, -2)), + paddle.squeeze(yzC - xs, axis=(-1, -2)), + paddle.squeeze(zxC - ys, axis=(-1, -2)), + paddle.squeeze(yzC + xs, axis=(-1, -2)), + paddle.squeeze(z * zC + ca, axis=(-1, -2)), + paddle.ones([ + vec.shape[0], + ], dtype=paddle.float32), + paddle.zeros([ + vec.shape[0], + ], dtype=paddle.float32) + ]) + gather_index = paddle.to_tensor([ + [0], + [1], + [2], + [10], + [3], + [4], + [5], + [10], + [6], + [7], + [8], + [10], + [10], + [10], + [10], + [9], + ]) + rot = paddle.gather_nd(gather_object, gather_index) + rot = 
rot.reshape([4, 4, -1]).transpose((2, 0, 1)) + return rot + + +def upsample(x): + """Upsample input tensor by a factor of 2 + """ + return F.interpolate(x, scale_factor=2, mode="nearest") + + +def get_smooth_loss(disp, img): + """Computes the smoothness loss for a disparity image + The color image is used for edge-aware smoothness + """ + grad_disp_x = paddle.abs(disp[:, :, :, :-1] - disp[:, :, :, 1:]) + grad_disp_y = paddle.abs(disp[:, :, :-1, :] - disp[:, :, 1:, :]) + + grad_img_x = paddle.mean(paddle.abs(img[:, :, :, :-1] - img[:, :, :, 1:]), + 1, + keepdim=True) + grad_img_y = paddle.mean(paddle.abs(img[:, :, :-1, :] - img[:, :, 1:, :]), + 1, + keepdim=True) + + grad_disp_x *= paddle.exp(-grad_img_x) + grad_disp_y *= paddle.exp(-grad_img_y) + + return grad_disp_x.mean() + grad_disp_y.mean() + + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return nn.Conv2D(in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias_attr=False, + dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2D(in_planes, + out_planes, + kernel_size=1, + stride=stride, + bias_attr=False) + + +def resnet_multiimage_input(num_layers, num_input_images=1): + """Constructs a ResNet model. + Args: + num_layers (int): Number of resnet layers. Must be 18 or 50 + pretrained (bool): If True, returns a model pre-trained on ImageNet + num_input_images (int): Number of frames stacked as input + """ + assert num_layers in [18, 50], "Can only run with 18 or 50 layer resnet" + blocks = {18: [2, 2, 2, 2], 50: [3, 4, 6, 3]}[num_layers] + + block_type = {18: BasicBlock, 50: Bottleneck}[num_layers] + + model = ResNetMultiImageInput(block_type, + num_layers, + blocks, + num_input_images=num_input_images) + model.init_weights() + return model + + +class ConvBlock(nn.Layer): + """Layer to perform a convolution followed by ELU + """ + def __init__(self, in_channels, out_channels): + super(ConvBlock, self).__init__() + + self.conv = Conv3x3(in_channels, out_channels) + self.nonlin = nn.ELU() + + def forward(self, x): + out = self.conv(x) + out = self.nonlin(out) + return out + + +class Conv3x3(nn.Layer): + """Layer to pad and convolve input + """ + def __init__(self, in_channels, out_channels, use_refl=True): + super(Conv3x3, self).__init__() + + if use_refl: + self.pad = nn.Pad2D(1, mode='reflect') + else: + self.pad = nn.Pad2D(1) + self.conv = nn.Conv2D(int(in_channels), int(out_channels), 3) + + def forward(self, x): + out = self.pad(x) + out = self.conv(out) + return out + + +class BackprojectDepth(nn.Layer): + """Layer to transform a depth image into a point cloud + """ + def __init__(self, batch_size, height, width): + super(BackprojectDepth, self).__init__() + + self.batch_size = batch_size + self.height = height + self.width = width + + meshgrid = np.meshgrid(range(self.width), + range(self.height), + indexing='xy') + id_coords = np.stack(meshgrid, axis=0).astype(np.float32) + self.id_coords = self.create_parameter(shape=list(id_coords.shape), + dtype=paddle.float32) + self.id_coords.set_value(id_coords) + self.add_parameter("id_coords", self.id_coords) + self.id_coords.stop_gradient = True + + self.ones = self.create_parameter( + shape=[self.batch_size, 1, self.height * self.width], + default_initializer=ones_) + self.add_parameter("ones", self.ones) + self.ones.stop_gradient = True + + pix_coords = paddle.unsqueeze( + paddle.stack([ + self.id_coords[0].reshape([ + -1, 
+ ]), self.id_coords[1].reshape([ + -1, + ]) + ], 0), 0) + pix_coords = pix_coords.tile([batch_size, 1, 1]) + pix_coords = paddle.concat([pix_coords, self.ones], 1) + self.pix_coords = self.create_parameter(shape=list(pix_coords.shape), ) + self.pix_coords.set_value(pix_coords) + self.add_parameter("pix_coords", self.pix_coords) + self.pix_coords.stop_gradient = True + + def forward(self, depth, inv_K): + cam_points = paddle.matmul(inv_K[:, :3, :3], self.pix_coords) + cam_points = depth.reshape([self.batch_size, 1, -1]) * cam_points + cam_points = paddle.concat([cam_points, self.ones], 1) + + return cam_points + + +class Project3D(nn.Layer): + """Layer which projects 3D points into a camera with intrinsics K and at position T + """ + def __init__(self, batch_size, height, width, eps=1e-7): + super(Project3D, self).__init__() + + self.batch_size = batch_size + self.height = height + self.width = width + self.eps = eps + + def forward(self, points, K, T): + P = paddle.matmul(K, T)[:, :3, :] + + cam_points = paddle.matmul(P, points) + + pix_coords = cam_points[:, :2, :] / (cam_points[:, 2, :].unsqueeze(1) + + self.eps) + pix_coords = pix_coords.reshape( + [self.batch_size, 2, self.height, self.width]) + pix_coords = pix_coords.transpose([0, 2, 3, 1]) + pix_coords[..., 0] /= self.width - 1 + pix_coords[..., 1] /= self.height - 1 + pix_coords = (pix_coords - 0.5) * 2 + return pix_coords + + +class SSIM(nn.Layer): + """Layer to compute the SSIM loss between a pair of images + """ + def __init__(self): + super(SSIM, self).__init__() + self.mu_x_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.mu_y_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.sig_x_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.sig_y_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.sig_xy_pool = nn.AvgPool2D(3, 1, exclusive=False) + + self.refl = nn.Pad2D(1, mode='reflect') + + self.C1 = 0.01**2 + self.C2 = 0.03**2 + + def forward(self, x, y): + x = self.refl(x) + y = self.refl(y) + + mu_x = self.mu_x_pool(x) + mu_y = self.mu_y_pool(y) + + sigma_x = self.sig_x_pool(x**2) - mu_x**2 + sigma_y = self.sig_y_pool(y**2) - mu_y**2 + sigma_xy = self.sig_xy_pool(x * y) - mu_x * mu_y + + SSIM_n = (2 * mu_x * mu_y + self.C1) * (2 * sigma_xy + self.C2) + SSIM_d = (mu_x**2 + mu_y**2 + self.C1) * (sigma_x + sigma_y + self.C2) + + return paddle.clip((1 - SSIM_n / SSIM_d) / 2, 0, 1) + + +class ResNetMultiImageInput(ResNet): + """Constructs a resnet model with varying number of input images. 
+ Adapted from https://github.com/pypaddle/vision/blob/master/paddlevision/models/resnet.py + """ + def __init__(self, block, depth, layers, num_input_images=1): + super(ResNetMultiImageInput, self).__init__(block, depth) + self.inplanes = 64 + self.conv1 = nn.Conv2D(num_input_images * 3, + 64, + kernel_size=7, + stride=2, + padding=3, + bias_attr=False) + self.bn1 = nn.BatchNorm2D(64) + self.relu = nn.ReLU() + self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + + def init_weights(self): + for layer in self.sublayers(include_self=True): + if isinstance(layer, nn.Conv2D): + kaiming_normal_(layer.weight, + mode='fan_out', + nonlinearity='relu') + elif isinstance(layer, nn.BatchNorm2D): + ones_(layer.weight) + zeros_(layer.bias) + + +class ConvBNLayer(nn.Layer): + """Conv2D and BatchNorm2D layer. + + Args: + in_channels (int): Number of channels for the input. + out_channels (int): Number of channels for the output. + kernel_size (int): Kernel size. + stride (int): Stride in the Conv2D layer. Default: 1. + groups (int): Groups in the Conv2D, Default: 1. + act (str): Indicate activation after BatchNorm2D layer. + name (str): the name of an instance of ConvBNLayer. + + Note: weight and bias initialization include initialize values + and name the restored parameters, values initialization + are explicit declared in the ```init_weights``` method. + + """ + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias_attr=False) + + self._act = act + + self._batch_norm = BatchNorm2D(out_channels) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + if self._act: + y = getattr(paddle.nn.functional, self._act)(y) + return y + + +class BasicBlock(nn.Layer): + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=None): + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2D + if groups != 1 or base_width != 64: + raise ValueError( + 'BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError( + "Dilation > 1 not supported in BasicBlock") + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU() + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class Bottleneck(nn.Layer): + # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) + # while original implementation places the stride at the first 1x1 convolution(self.conv1) + # according to "Deep residual 
learning for image recognition"https://arxiv.org/abs/1512.03385. + # This variant is also known as ResNet V1.5 and improves accuracy according to + # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. + + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=None): + super(Bottleneck, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2D + width = int(planes * (base_width / 64.)) * groups + + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU() + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class DepthDecoder(nn.Layer): + def __init__(self, + num_ch_enc, + scales=range(4), + num_output_channels=1, + use_skips=True): + super(DepthDecoder, self).__init__() + + self.num_output_channels = num_output_channels + self.use_skips = use_skips + self.upsample_mode = 'nearest' + self.scales = scales + + self.num_ch_enc = num_ch_enc + self.num_ch_dec = np.array([16, 32, 64, 128, 256]) + + # decoder + self.convs = OrderedDict() + for i in range(4, -1, -1): + # upconv_0 + num_ch_in = self.num_ch_enc[-1] if i == 4 else self.num_ch_dec[i + + 1] + num_ch_out = self.num_ch_dec[i] + self.convs[("upconv", i, 0)] = ConvBlock(num_ch_in, num_ch_out) + + # upconv_1 + num_ch_in = self.num_ch_dec[i] + if self.use_skips and i > 0: + num_ch_in += self.num_ch_enc[i - 1] + num_ch_out = self.num_ch_dec[i] + self.convs[("upconv", i, 1)] = ConvBlock(num_ch_in, num_ch_out) + + for s in self.scales: + self.convs[("dispconv", s)] = Conv3x3(self.num_ch_dec[s], + self.num_output_channels) + + self.decoder = nn.LayerList(list(self.convs.values())) + self.sigmoid = nn.Sigmoid() + + def forward(self, input_features): + outputs = {} + + # decoder + x = input_features[-1] + for i in range(4, -1, -1): + x = self.convs[("upconv", i, 0)](x) + x = [upsample(x)] + if self.use_skips and i > 0: + x += [input_features[i - 1]] + x = paddle.concat(x, 1) + x = self.convs[("upconv", i, 1)](x) + if i in self.scales: + outputs[("disp", i)] = self.sigmoid(self.convs[("dispconv", + i)](x)) + return outputs + + +class PoseDecoder(nn.Layer): + def __init__(self, + num_ch_enc, + num_input_features, + num_frames_to_predict_for=None, + stride=1): + super(PoseDecoder, self).__init__() + + self.num_ch_enc = num_ch_enc + self.num_input_features = num_input_features + + if num_frames_to_predict_for is None: + num_frames_to_predict_for = num_input_features - 1 + self.num_frames_to_predict_for = num_frames_to_predict_for + + self.convs = OrderedDict() + self.convs[("squeeze")] = nn.Conv2D(self.num_ch_enc[-1], 256, 1) + self.convs[("pose", 0)] = nn.Conv2D(num_input_features * 256, 256, 3, + stride, 1) + self.convs[("pose", 1)] = nn.Conv2D(256, 256, 3, stride, 1) + self.convs[("pose", 2)] = nn.Conv2D(256, 6 * num_frames_to_predict_for, + 1) + + self.relu = nn.ReLU() + + self.net = nn.LayerList(list(self.convs.values())) + + def 
forward(self, input_features): + last_features = [f[-1] for f in input_features] + + cat_features = [ + self.relu(self.convs["squeeze"](f)) for f in last_features + ] + cat_features = paddle.concat(cat_features, 1) + + out = cat_features + for i in range(3): + out = self.convs[("pose", i)](out) + if i != 2: + out = self.relu(out) + + out = out.mean(3).mean(2) + + out = 0.01 * out.reshape([-1, self.num_frames_to_predict_for, 1, 6]) + + axisangle = out[..., :3] + translation = out[..., 3:] + + return axisangle, translation + + +class ResnetEncoder(nn.Layer): + """Pypaddle module for a resnet encoder + """ + def __init__(self, num_layers, pretrained=False, num_input_images=1): + super(ResnetEncoder, self).__init__() + + self.num_ch_enc = np.array([64, 64, 128, 256, 512]) + + resnets = { + 18: paddle.vision.models.resnet18, + 34: paddle.vision.models.resnet34, + 50: paddle.vision.models.resnet50, + 101: paddle.vision.models.resnet101, + 152: paddle.vision.models.resnet152 + } + + if num_layers not in resnets: + raise ValueError( + "{} is not a valid number of resnet layers".format(num_layers)) + + if num_input_images > 1: + self.encoder = resnet_multiimage_input(num_layers, pretrained, + num_input_images) + else: + self.encoder = resnets[num_layers](pretrained) + + if num_layers > 34: + self.num_ch_enc[1:] *= 4 + + ###################################### + # night public first conv + ###################################### + self.conv1 = nn.Conv2D(3, + 64, + kernel_size=7, + stride=2, + padding=3, + bias_attr=False) + self.bn1 = nn.BatchNorm2D(64) + self.relu = nn.ReLU() # NOTE + + self.conv_shared = nn.Conv2D(512, 64, kernel_size=1) + + ########################################## + # private source encoder, day + ########################################## + self.encoder_day = resnets[num_layers](pretrained) + self.conv_diff_day = nn.Conv2D( + 512, 64, kernel_size=1) # no bn after conv, so bias=true + + ########################################## + # private target encoder, night + ########################################## + self.encoder_night = resnets[num_layers](pretrained) + self.conv_diff_night = nn.Conv2D(512, 64, kernel_size=1) + + ###################################### + # shared decoder (small decoder), use a simple de-conv to upsample the features with no skip connection + ###################################### + self.convt5 = convt_bn_relu(in_channels=512, + out_channels=256, + kernel_size=3, + stride=2, + padding=1, + output_padding=1) + self.convt4 = convt_bn_relu(in_channels=256, + out_channels=128, + kernel_size=3, + stride=2, + padding=1, + output_padding=1) + self.convt3 = convt_bn_relu(in_channels=128, + out_channels=64, + kernel_size=3, + stride=2, + padding=1, + output_padding=1) + self.convt2 = convt_bn_relu(in_channels=64, + out_channels=64, + kernel_size=3, + stride=2, + padding=1, + output_padding=1) + self.convt1 = convt_bn_relu(in_channels=64, + out_channels=64, + kernel_size=3, + stride=2, + padding=1, + output_padding=1) + self.convtf = nn.Conv2D(64, 3, kernel_size=1, stride=1, padding=0) + + def forward(self, input_image, is_night): + if self.training: + result = [] + input_data = (input_image - 0.45) / 0.225 + if is_night == 'day': + # source private encoder, day + private_feature = self.encoder_day.conv1(input_data) + private_feature = self.encoder_day.bn1(private_feature) + private_feature = self.encoder_day.relu(private_feature) + private_feature = self.encoder_day.maxpool(private_feature) + private_feature = self.encoder_day.layer1(private_feature) + 
private_feature = self.encoder_day.layer2(private_feature) + private_feature = self.encoder_day.layer3(private_feature) + private_feature = self.encoder_day.layer4(private_feature) + private_code = self.conv_diff_day(private_feature) + private_gram = gram_matrix(private_feature) + result.append(private_code) + result.append(private_gram) + + elif is_night == 'night': + # target private encoder, night + private_feature = self.encoder_night.conv1(input_data) + private_feature = self.encoder_night.bn1(private_feature) + private_feature = self.encoder_night.relu(private_feature) + private_feature = self.encoder_night.maxpool(private_feature) + private_feature = self.encoder_night.layer1(private_feature) + private_feature = self.encoder_night.layer2(private_feature) + private_feature = self.encoder_night.layer3(private_feature) + private_feature = self.encoder_night.layer4(private_feature) + private_code = self.conv_diff_night(private_feature) + + private_gram = gram_matrix(private_feature) + result.append(private_code) + result.append(private_gram) + + # shared encoder + self.features = [] + x = (input_image - 0.45) / 0.225 + if is_night == 'day': + x = self.encoder.conv1(x) + x = self.encoder.bn1(x) + self.features.append(self.encoder.relu(x)) + else: + x = self.conv1(x) + x = self.bn1(x) + self.features.append(self.relu(x)) + + self.features.append( + self.encoder.layer1(self.encoder.maxpool(self.features[-1]))) + self.features.append(self.encoder.layer2(self.features[-1])) + self.features.append(self.encoder.layer3(self.features[-1])) + self.features.append(self.encoder.layer4(self.features[-1])) + + if self.training: + shared_code = self.conv_shared(self.features[-1]) + shared_gram = gram_matrix(self.features[-1]) + result.append(shared_code) # use this to calculate loss of diff + result.append(shared_gram) + result.append( + self.features[-1]) # use this to calculate loss of similarity + + union_code = private_feature + self.features[-1] + rec_code = self.convt5(union_code) + rec_code = self.convt4(rec_code) + rec_code = self.convt3(rec_code) + rec_code = self.convt2(rec_code) + rec_code = self.convt1(rec_code) + rec_code = self.convtf(rec_code) + result.append(rec_code) + + return self.features, result + else: + return self.features + + +class ResnetEncoder_pose(nn.Layer): + """Pypaddle module for a resnet encoder + """ + def __init__(self, num_layers, pretrained=False, num_input_images=1): + super(ResnetEncoder_pose, self).__init__() + + self.num_ch_enc = np.array([64, 64, 128, 256, 512]) + resnets = { + 18: paddle.vision.models.resnet18, + 34: paddle.vision.models.resnet34, + 50: paddle.vision.models.resnet50, + 101: paddle.vision.models.resnet101, + 152: paddle.vision.models.resnet152 + } + + if num_layers not in resnets: + raise ValueError( + "{} is not a valid number of resnet layers".format(num_layers)) + + if num_input_images > 1: + self.encoder = resnet_multiimage_input(num_layers, num_input_images) + else: + self.encoder = resnets[num_layers](pretrained) + + if num_layers > 34: + self.num_ch_enc[1:] *= 4 + + def forward(self, input_image): + features = [] + x = (input_image - 0.45) / 0.225 + x = self.encoder.conv1(x) + x = self.encoder.bn1(x) + features.append(self.encoder.relu(x)) + features.append(self.encoder.layer1(self.encoder.maxpool(features[-1]))) + features.append(self.encoder.layer2(features[-1])) + features.append(self.encoder.layer3(features[-1])) + features.append(self.encoder.layer4(features[-1])) + + return features + + +@BACKBONES.register() +class 
ADDS_DepthNet(nn.Layer): + def __init__(self, + num_layers=18, + frame_ids=[0, -1, 1], + height=256, + width=512, + batch_size=6, + pose_model_input="pairs", + use_stereo=False, + only_depth_encoder=False, + pretrained=None, + scales=[0, 1, 2, 3], + min_depth=0.1, + max_depth=100.0, + pose_model_type='separate_resnet', + v1_multiscale=False, + predictive_mask=False, + disable_automasking=False): + super(ADDS_DepthNet, self).__init__() + self.num_layers = num_layers + self.height = height + self.width = width + self.batch_size = batch_size + self.frame_ids = frame_ids + self.pose_model_input = pose_model_input + self.use_stereo = use_stereo + self.only_depth_encoder = only_depth_encoder + self.pretrained = pretrained + self.scales = scales + self.pose_model_type = pose_model_type + self.predictive_mask = predictive_mask + self.disable_automasking = disable_automasking + self.v1_multiscale = v1_multiscale + self.min_depth = min_depth + self.max_depth = max_depth + + self.num_input_frames = len(self.frame_ids) + self.num_pose_frames = 2 if self.pose_model_input == "pairs" else self.num_input_frames + + assert self.frame_ids[0] == 0, "frame_ids must start with 0" + + self.use_pose_net = not (self.use_stereo and self.frame_ids == [0]) + + self.encoder = ResnetEncoder(self.num_layers) + if not self.only_depth_encoder: + self.depth = DepthDecoder(self.encoder.num_ch_enc, self.scales) + if self.use_pose_net and not self.only_depth_encoder: + if self.pose_model_type == "separate_resnet": + self.pose_encoder = ResnetEncoder_pose( + self.num_layers, num_input_images=self.num_pose_frames) + self.pose = PoseDecoder(self.pose_encoder.num_ch_enc, + num_input_features=1, + num_frames_to_predict_for=2) + + self.backproject_depth = {} + self.project_3d = {} + for scale in self.scales: + h = self.height // (2**scale) + w = self.width // (2**scale) + + self.backproject_depth[scale] = BackprojectDepth( + self.batch_size, h, w) + self.project_3d[scale] = Project3D(batch_size, h, w) + + def init_weights(self): + """First init model's weight""" + for m in self.sublayers(include_self=True): + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight, a=math.sqrt(5)) + if m.bias is not None: + fan_in, _ = _calculate_fan_in_and_fan_out(m.weight) + bound = 1 / math.sqrt(fan_in) + uniform_ = paddle.nn.initializer.Uniform(-bound, bound) + uniform_(m.bias) + """Second, if provide pretrained ckpt, load it""" + if self.pretrained: # load pretrained weights + load_ckpt(self, self.pretrained) + + def forward(self, inputs, day_or_night='day'): + if self.training: + features, result = self.encoder(inputs["color_aug", 0, 0], 'day') + features_night, result_night = self.encoder( + inputs[("color_n_aug", 0, 0)], 'night') + + outputs = self.depth(features) + outputs_night = self.depth(features_night) + if self.use_pose_net and not self.only_depth_encoder: + outputs.update(self.predict_poses(inputs, 'day')) + outputs_night.update(self.predict_poses(inputs, 'night')) + + self.generate_images_pred(inputs, outputs, 'day') + self.generate_images_pred(inputs, outputs_night, 'night') + + outputs['frame_ids'] = self.frame_ids + outputs['scales'] = self.scales + outputs['result'] = result + outputs['result_night'] = result_night + outputs_night['frame_ids'] = self.frame_ids + outputs_night['scales'] = self.scales + outputs['outputs_night'] = outputs_night + else: + if isinstance(inputs, dict): + input_color = inputs[("color", 0, 0)] + features = self.encoder(input_color, day_or_night[0]) + outputs = self.depth(features) + + pred_disp, _ = 
disp_to_depth(outputs[("disp", 0)], + self.min_depth, self.max_depth) + + pred_disp = pred_disp[:, 0].numpy() + + outputs['pred_disp'] = np.squeeze(pred_disp) + + outputs['gt'] = np.squeeze(inputs['depth_gt'].numpy()) + else: + input_color = inputs + features = self.encoder(input_color, day_or_night) + outputs = self.depth(features) + + pred_disp, _ = disp_to_depth(outputs[("disp", 0)], + self.min_depth, self.max_depth) + + pred_disp = pred_disp[:, 0] + outputs = paddle.squeeze(pred_disp) + return outputs + + def predict_poses(self, inputs, is_night): + """Predict poses between input frames for monocular sequences. + """ + outputs = {} + if self.num_pose_frames == 2: + if is_night: + pose_feats = { + f_i: inputs["color_n_aug", f_i, 0] + for f_i in self.frame_ids + } + else: + pose_feats = { + f_i: inputs["color_aug", f_i, 0] + for f_i in self.frame_ids + } + + for f_i in self.frame_ids[1:]: + if f_i != "s": + if f_i < 0: + pose_inputs = [pose_feats[f_i], pose_feats[0]] + else: + pose_inputs = [pose_feats[0], pose_feats[f_i]] + + if self.pose_model_type == "separate_resnet": + pose_inputs = [ + self.pose_encoder(paddle.concat(pose_inputs, + axis=1)) + ] + + axisangle, translation = self.pose(pose_inputs) + outputs[("axisangle", 0, f_i)] = axisangle + outputs[("translation", 0, f_i)] = translation + + # Invert the matrix if the frame id is negative + outputs[("cam_T_cam", 0, + f_i)] = transformation_from_parameters( + axisangle[:, 0], + translation[:, 0], + invert=(f_i < 0)) + return outputs + + def generate_images_pred(self, inputs, outputs, is_night): + """Generate the warped (reprojected) color images for a minibatch. + Generated images are saved into the `outputs` dictionary. + """ + _, _, height, width = inputs['color', 0, 0].shape + for scale in self.scales: + disp = outputs[("disp", scale)] + if self.v1_multiscale: + source_scale = scale + else: + disp = F.interpolate(disp, [height, width], + mode="bilinear", + align_corners=False) + source_scale = 0 + + _, depth = disp_to_depth(disp, self.min_depth, self.max_depth) + + outputs[("depth", 0, scale)] = depth + for i, frame_id in enumerate(self.frame_ids[1:]): + + T = outputs[("cam_T_cam", 0, frame_id)] + + cam_points = self.backproject_depth[source_scale]( + depth, inputs[("inv_K", source_scale)]) + pix_coords = self.project_3d[source_scale]( + cam_points, inputs[("K", source_scale)], T) + + outputs[("sample", frame_id, scale)] = pix_coords + + if is_night: + inputs[("color_n", frame_id, + source_scale)].stop_gradient = False + outputs[("color", frame_id, + scale)] = paddle.nn.functional.grid_sample( + inputs[("color_n", frame_id, source_scale)], + outputs[("sample", frame_id, scale)], + padding_mode="border", + align_corners=False) + + else: + inputs[("color", frame_id, + source_scale)].stop_gradient = False + outputs[("color", frame_id, + scale)] = paddle.nn.functional.grid_sample( + inputs[("color", frame_id, source_scale)], + outputs[("sample", frame_id, scale)], + padding_mode="border", + align_corners=False) + + if not self.disable_automasking: + if is_night: + outputs[("color_identity", frame_id, scale)] = \ + inputs[("color_n", frame_id, source_scale)] + else: + outputs[("color_identity", frame_id, scale)] = \ + inputs[("color", frame_id, source_scale)] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/agcn.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/agcn.py new file mode 100644 index 0000000..9f870c6 --- /dev/null +++ 
b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/agcn.py @@ -0,0 +1,128 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ..registry import BACKBONES + + +class GCN(nn.Layer): + def __init__(self, in_channels, out_channels, vertex_nums=25, stride=1): + super(GCN, self).__init__() + self.conv1 = nn.Conv2D(in_channels=in_channels, + out_channels=3 * out_channels, + kernel_size=1, + stride=1) + self.conv2 = nn.Conv2D(in_channels=vertex_nums * 3, + out_channels=vertex_nums, + kernel_size=1) + + def forward(self, x): + # x --- N,C,T,V + x = self.conv1(x) # N,3C,T,V + N, C, T, V = x.shape + x = paddle.reshape(x, [N, C // 3, 3, T, V]) # N,C,3,T,V + x = paddle.transpose(x, perm=[0, 1, 2, 4, 3]) # N,C,3,V,T + x = paddle.reshape(x, [N, C // 3, 3 * V, T]) # N,C,3V,T + x = paddle.transpose(x, perm=[0, 2, 1, 3]) # N,3V,C,T + x = self.conv2(x) # N,V,C,T + x = paddle.transpose(x, perm=[0, 2, 3, 1]) # N,C,T,V + return x + + +class Block(paddle.nn.Layer): + def __init__(self, + in_channels, + out_channels, + vertex_nums=25, + temporal_size=9, + stride=1, + residual=True): + super(Block, self).__init__() + self.residual = residual + self.out_channels = out_channels + + self.bn_res = nn.BatchNorm2D(out_channels) + self.conv_res = nn.Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=(stride, 1)) + self.gcn = GCN(in_channels=in_channels, + out_channels=out_channels, + vertex_nums=vertex_nums) + self.tcn = nn.Sequential( + nn.BatchNorm2D(out_channels), + nn.ReLU(), + nn.Conv2D(in_channels=out_channels, + out_channels=out_channels, + kernel_size=(temporal_size, 1), + padding=((temporal_size - 1) // 2, 0), + stride=(stride, 1)), + nn.BatchNorm2D(out_channels), + ) + + def forward(self, x): + if self.residual: + y = self.conv_res(x) + y = self.bn_res(y) + x = self.gcn(x) + x = self.tcn(x) + out = x + y if self.residual else x + out = F.relu(out) + return out + + +@BACKBONES.register() +class AGCN(nn.Layer): + """ + AGCN model improves the performance of ST-GCN using + Adaptive Graph Convolutional Networks. + Args: + in_channels: int, channels of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default 2. 
+ """ + def __init__(self, in_channels=2, **kwargs): + super(AGCN, self).__init__() + + self.data_bn = nn.BatchNorm1D(25 * 2) + self.agcn = nn.Sequential( + Block(in_channels=in_channels, + out_channels=64, + residual=False, + **kwargs), Block(in_channels=64, out_channels=64, **kwargs), + Block(in_channels=64, out_channels=64, **kwargs), + Block(in_channels=64, out_channels=64, **kwargs), + Block(in_channels=64, out_channels=128, stride=2, **kwargs), + Block(in_channels=128, out_channels=128, **kwargs), + Block(in_channels=128, out_channels=128, **kwargs), + Block(in_channels=128, out_channels=256, stride=2, **kwargs), + Block(in_channels=256, out_channels=256, **kwargs), + Block(in_channels=256, out_channels=256, **kwargs)) + + self.pool = nn.AdaptiveAvgPool2D(output_size=(1, 1)) + + def forward(self, x): + # data normalization + N, C, T, V, M = x.shape + + x = x.transpose((0, 4, 1, 2, 3)) # N, M, C, T, V + x = x.reshape((N * M, C, T, V)) + + x = self.agcn(x) + + x = self.pool(x) # NM,C,T,V --> NM,C,1,1 + C = x.shape[1] + x = paddle.reshape(x, (N, M, C, 1, 1)).mean(axis=1) # N,C,1,1 + + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/agcn2s.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/agcn2s.py new file mode 100644 index 0000000..a630c68 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/agcn2s.py @@ -0,0 +1,229 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
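+# Illustrative usage sketch for the AGCN backbone defined in agcn.py above (the
+# import path and sample sizes are assumptions): the expected input is a skeleton
+# tensor of shape [N, C, T, V, M] with C=2 coordinate channels, V=25 joints and
+# M persons; persons are folded into the batch, the stacked GCN/TCN blocks raise
+# the channel width to 256, and the pooled, person-averaged output is [N, 256, 1, 1].
+#
+#     import paddle
+#     from paddlevideo.modeling.backbones.agcn import AGCN  # assumed module path
+#
+#     model = AGCN(in_channels=2)
+#     x = paddle.randn([4, 2, 50, 25, 2])  # 4 clips, 50 frames, 25 joints, 2 persons
+#     feat = model(x)
+#     print(feat.shape)                    # [4, 256, 1, 1]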
+ +import paddle +import paddle.nn as nn +import numpy as np +from ..registry import BACKBONES + + +def import_class(name): + components = name.split('.') + mod = __import__(components[0]) + for comp in components[1:]: + mod = getattr(mod, comp) + return mod + + +class UnitTCN(nn.Layer): + def __init__(self, in_channels, out_channels, kernel_size=9, stride=1): + super(UnitTCN, self).__init__() + pad = int((kernel_size - 1) / 2) + self.conv = nn.Conv2D(in_channels, + out_channels, + kernel_size=(kernel_size, 1), + padding=(pad, 0), + stride=(stride, 1)) + + self.bn = nn.BatchNorm2D(out_channels) + self.relu = nn.ReLU() + + def forward(self, x): + " input size : (N*M, C, T, V)" + x = self.bn(self.conv(x)) + return x + + +class UnitGCN(nn.Layer): + def __init__(self, + in_channels, + out_channels, + A, + coff_embedding=4, + num_subset=3): + super(UnitGCN, self).__init__() + inter_channels = out_channels // coff_embedding + self.inter_c = inter_channels + PA = self.create_parameter(shape=A.shape, dtype='float32') + self.PA = PA + self.A = paddle.to_tensor(A.astype(np.float32)) + self.num_subset = num_subset + + self.conv_a = nn.LayerList() + self.conv_b = nn.LayerList() + self.conv_d = nn.LayerList() + for i in range(self.num_subset): + self.conv_a.append(nn.Conv2D(in_channels, inter_channels, 1)) + self.conv_b.append(nn.Conv2D(in_channels, inter_channels, 1)) + self.conv_d.append(nn.Conv2D(in_channels, out_channels, 1)) + + if in_channels != out_channels: + self.down = nn.Sequential(nn.Conv2D(in_channels, out_channels, 1), + nn.BatchNorm2D(out_channels)) + else: + self.down = lambda x: x + + self.bn = nn.BatchNorm2D(out_channels) + self.soft = nn.Softmax(-2) + self.relu = nn.ReLU() + + def forward(self, x): + N, C, T, V = x.shape + A = self.A + self.PA + + y = None + for i in range(self.num_subset): + A1 = paddle.transpose(self.conv_a[i](x), + perm=[0, 3, 1, + 2]).reshape([N, V, self.inter_c * T]) + A2 = self.conv_b[i](x).reshape([N, self.inter_c * T, V]) + A1 = self.soft(paddle.matmul(A1, A2) / A1.shape[-1]) + A1 = A1 + A[i] + A2 = x.reshape([N, C * T, V]) + z = self.conv_d[i](paddle.matmul(A2, A1).reshape([N, C, T, V])) + y = z + y if y is not None else z + + y = self.bn(y) + y += self.down(x) + return self.relu(y) + + +class Block(nn.Layer): + def __init__(self, in_channels, out_channels, A, stride=1, residual=True): + super(Block, self).__init__() + self.gcn1 = UnitGCN(in_channels, out_channels, A) + self.tcn1 = UnitTCN(out_channels, out_channels, stride=stride) + self.relu = nn.ReLU() + if not residual: + self.residual = lambda x: 0 + + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + + else: + self.residual = UnitTCN(in_channels, + out_channels, + kernel_size=1, + stride=stride) + + def forward(self, x): + x = self.tcn1(self.gcn1(x)) + self.residual(x) + return self.relu(x) + + +# This Graph structure is for the NTURGB+D dataset. If you use a custom dataset, modify num_node and the corresponding graph adjacency structure. 
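+# In 'spatial' labeling mode the adjacency stacks three 25x25 matrices: the
+# identity (self links), the degree-normalized inward adjacency and the
+# degree-normalized outward adjacency, giving Graph().A the shape (3, 25, 25);
+# UnitGCN adds this fixed A to its learnable PA, one subset per matrix.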
+class Graph: + def __init__(self, labeling_mode='spatial'): + num_node = 25 + self_link = [(i, i) for i in range(num_node)] + inward_ori_index = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5), + (7, 6), (8, 7), (9, 21), (10, 9), (11, 10), + (12, 11), (13, 1), (14, 13), (15, 14), (16, 15), + (17, 1), (18, 17), (19, 18), (20, 19), (22, 23), + (23, 8), (24, 25), (25, 12)] + inward = [(i - 1, j - 1) for (i, j) in inward_ori_index] + outward = [(j, i) for (i, j) in inward] + neighbor = inward + outward + + self.num_node = num_node + self.self_link = self_link + self.inward = inward + self.outward = outward + self.neighbor = neighbor + self.A = self.get_adjacency_matrix(labeling_mode) + + def edge2mat(self, link, num_node): + A = np.zeros((num_node, num_node)) + for i, j in link: + A[j, i] = 1 + return A + + def normalize_digraph(self, A): + Dl = np.sum(A, 0) + h, w = A.shape + Dn = np.zeros((w, w)) + for i in range(w): + if Dl[i] > 0: + Dn[i, i] = Dl[i]**(-1) + AD = np.dot(A, Dn) + return AD + + def get_spatial_graph(self, num_node, self_link, inward, outward): + I = self.edge2mat(self_link, num_node) + In = self.normalize_digraph(self.edge2mat(inward, num_node)) + Out = self.normalize_digraph(self.edge2mat(outward, num_node)) + A = np.stack((I, In, Out)) + return A + + def get_adjacency_matrix(self, labeling_mode=None): + if labeling_mode is None: + return self.A + if labeling_mode == 'spatial': + A = self.get_spatial_graph(self.num_node, self.self_link, + self.inward, self.outward) + else: + raise ValueError() + return A + + +@BACKBONES.register() +class AGCN2s(nn.Layer): + def __init__(self, + num_point=25, + num_person=2, + graph='ntu_rgb_d', + graph_args=dict(), + in_channels=3): + super(AGCN2s, self).__init__() + + if graph == 'ntu_rgb_d': + self.graph = Graph(**graph_args) + else: + raise ValueError() + + A = self.graph.A + self.data_bn = nn.BatchNorm1D(num_person * in_channels * num_point) + + self.l1 = Block(in_channels, 64, A, residual=False) + self.l2 = Block(64, 64, A) + self.l3 = Block(64, 64, A) + self.l4 = Block(64, 64, A) + self.l5 = Block(64, 128, A, stride=2) + self.l6 = Block(128, 128, A) + self.l7 = Block(128, 128, A) + self.l8 = Block(128, 256, A, stride=2) + self.l9 = Block(256, 256, A) + self.l10 = Block(256, 256, A) + + def forward(self, x): + N, C, T, V, M = x.shape + + x = x.transpose([0, 4, 3, 1, 2]).reshape_([N, M * V * C, T]) + x = self.data_bn(x) + x = x.reshape_([N, M, V, C, + T]).transpose([0, 1, 3, 4, + 2]).reshape_([N * M, C, T, V]) + + x = self.l1(x) + x = self.l2(x) + x = self.l3(x) + x = self.l4(x) + x = self.l5(x) + x = self.l6(x) + x = self.l7(x) + x = self.l8(x) + x = self.l9(x) + x = self.l10(x) + + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/asrf.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/asrf.py new file mode 100644 index 0000000..37437b3 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/asrf.py @@ -0,0 +1,75 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# https://github.com/yabufarha/ms-tcn/blob/master/model.py +# https://github.com/yiskw713/asrf/libs/models/tcn.py + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +import copy +import random +import math + +from paddle import ParamAttr +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from .ms_tcn import DilatedResidualLayer +from ..framework.segmenters.utils import init_bias, KaimingUniform_like_torch + + +@BACKBONES.register() +class ASRF(nn.Layer): + + def __init__(self, in_channel, num_features, num_classes, num_stages, + num_layers): + super().__init__() + self.in_channel = in_channel + self.num_features = num_features + self.num_classes = num_classes + self.num_stages = num_stages + self.num_layers = num_layers + + # define layers + self.conv_in = nn.Conv1D(self.in_channel, self.num_features, 1) + + shared_layers = [ + DilatedResidualLayer(2**i, self.num_features, self.num_features) + for i in range(self.num_layers) + ] + self.shared_layers = nn.LayerList(shared_layers) + + self.init_weights() + + def init_weights(self): + """ + initialize model layers' weight + """ + # init weight + for layer in self.sublayers(): + if isinstance(layer, nn.Conv1D): + layer.weight.set_value( + KaimingUniform_like_torch(layer.weight).astype('float32')) + if layer.bias is not None: + layer.bias.set_value( + init_bias(layer.weight, layer.bias).astype('float32')) + + def forward(self, x): + """ ASRF forward + """ + out = self.conv_in(x) + for layer in self.shared_layers: + out = layer(out) + return out diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/bmn.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/bmn.py new file mode 100644 index 0000000..200d192 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/bmn.py @@ -0,0 +1,290 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
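+# Illustrative usage sketch for the ASRF backbone defined in asrf.py above (import
+# path and config values are assumptions): it maps frame-wise features of shape
+# [N, in_channel, T] to [N, num_features, T] via a 1x1 Conv1D followed by num_layers
+# dilated residual layers (dilations 1, 2, 4, ...); num_classes and num_stages are
+# stored for the segmentation head and do not change the backbone output shape.
+#
+#     import paddle
+#     from paddlevideo.modeling.backbones.asrf import ASRF  # assumed module path
+#
+#     backbone = ASRF(in_channel=2048, num_features=64, num_classes=19,
+#                     num_stages=4, num_layers=10)
+#     feats = backbone(paddle.randn([1, 2048, 300]))  # one 300-frame clip
+#     print(feats.shape)                              # [1, 64, 300]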
+ +import math +import numpy as np +import paddle +from paddle import ParamAttr +from ..registry import BACKBONES + + +def _get_interp1d_bin_mask(seg_xmin, seg_xmax, tscale, num_sample, + num_sample_perbin): + """ generate sample mask for a boundary-matching pair """ + plen = float(seg_xmax - seg_xmin) + plen_sample = plen / (num_sample * num_sample_perbin - 1.0) + total_samples = [ + seg_xmin + plen_sample * ii + for ii in range(num_sample * num_sample_perbin) + ] + p_mask = [] + for idx in range(num_sample): + bin_samples = total_samples[idx * num_sample_perbin:(idx + 1) * + num_sample_perbin] + bin_vector = np.zeros([tscale]) + for sample in bin_samples: + sample_upper = math.ceil(sample) + sample_decimal, sample_down = math.modf(sample) + if (tscale - 1) >= int(sample_down) >= 0: + bin_vector[int(sample_down)] += 1 - sample_decimal + if (tscale - 1) >= int(sample_upper) >= 0: + bin_vector[int(sample_upper)] += sample_decimal + bin_vector = 1.0 / num_sample_perbin * bin_vector + p_mask.append(bin_vector) + p_mask = np.stack(p_mask, axis=1) + return p_mask + + +def get_interp1d_mask(tscale, dscale, prop_boundary_ratio, num_sample, + num_sample_perbin): + """ generate sample mask for each point in Boundary-Matching Map """ + mask_mat = [] + for start_index in range(tscale): + mask_mat_vector = [] + for duration_index in range(dscale): + if start_index + duration_index < tscale: + p_xmin = start_index + p_xmax = start_index + duration_index + center_len = float(p_xmax - p_xmin) + 1 + sample_xmin = p_xmin - center_len * prop_boundary_ratio + sample_xmax = p_xmax + center_len * prop_boundary_ratio + p_mask = _get_interp1d_bin_mask(sample_xmin, sample_xmax, + tscale, num_sample, + num_sample_perbin) + else: + p_mask = np.zeros([tscale, num_sample]) + mask_mat_vector.append(p_mask) + mask_mat_vector = np.stack(mask_mat_vector, axis=2) + mask_mat.append(mask_mat_vector) + mask_mat = np.stack(mask_mat, axis=3) + mask_mat = mask_mat.astype(np.float32) + + sample_mask = np.reshape(mask_mat, [tscale, -1]) + return sample_mask + + +def init_params(name, in_channels, kernel_size): + fan_in = in_channels * kernel_size * 1 + k = 1. / math.sqrt(fan_in) + param_attr = ParamAttr(name=name, + initializer=paddle.nn.initializer.Uniform(low=-k, + high=k)) + return param_attr + + +@BACKBONES.register() +class BMN(paddle.nn.Layer): + """BMN model from + `"BMN: Boundary-Matching Network for Temporal Action Proposal Generation" `_ + Args: + tscale (int): sequence length, default 100. + dscale (int): max duration length, default 100. + prop_boundary_ratio (float): ratio of expanded temporal region in proposal boundary, default 0.5. + num_sample (int): number of samples betweent starting boundary and ending boundary of each propoasl, default 32. + num_sample_perbin (int): number of selected points in each sample, default 3. 
+ """ + + def __init__( + self, + tscale, + dscale, + prop_boundary_ratio, + num_sample, + num_sample_perbin, + feat_dim=400, + ): + super(BMN, self).__init__() + + #init config + self.feat_dim = feat_dim + self.tscale = tscale + self.dscale = dscale + self.prop_boundary_ratio = prop_boundary_ratio + self.num_sample = num_sample + self.num_sample_perbin = num_sample_perbin + + self.hidden_dim_1d = 256 + self.hidden_dim_2d = 128 + self.hidden_dim_3d = 512 + + # Base Module + self.b_conv1 = paddle.nn.Conv1D( + in_channels=self.feat_dim, + out_channels=self.hidden_dim_1d, + kernel_size=3, + padding=1, + groups=4, + weight_attr=init_params('Base_1_w', self.feat_dim, 3), + bias_attr=init_params('Base_1_b', self.feat_dim, 3)) + self.b_conv1_act = paddle.nn.ReLU() + + self.b_conv2 = paddle.nn.Conv1D( + in_channels=self.hidden_dim_1d, + out_channels=self.hidden_dim_1d, + kernel_size=3, + padding=1, + groups=4, + weight_attr=init_params('Base_2_w', self.hidden_dim_1d, 3), + bias_attr=init_params('Base_2_b', self.hidden_dim_1d, 3)) + self.b_conv2_act = paddle.nn.ReLU() + + # Temporal Evaluation Module + self.ts_conv1 = paddle.nn.Conv1D( + in_channels=self.hidden_dim_1d, + out_channels=self.hidden_dim_1d, + kernel_size=3, + padding=1, + groups=4, + weight_attr=init_params('TEM_s1_w', self.hidden_dim_1d, 3), + bias_attr=init_params('TEM_s1_b', self.hidden_dim_1d, 3)) + self.ts_conv1_act = paddle.nn.ReLU() + + self.ts_conv2 = paddle.nn.Conv1D( + in_channels=self.hidden_dim_1d, + out_channels=1, + kernel_size=1, + padding=0, + groups=1, + weight_attr=init_params('TEM_s2_w', self.hidden_dim_1d, 1), + bias_attr=init_params('TEM_s2_b', self.hidden_dim_1d, 1)) + self.ts_conv2_act = paddle.nn.Sigmoid() + + self.te_conv1 = paddle.nn.Conv1D( + in_channels=self.hidden_dim_1d, + out_channels=self.hidden_dim_1d, + kernel_size=3, + padding=1, + groups=4, + weight_attr=init_params('TEM_e1_w', self.hidden_dim_1d, 3), + bias_attr=init_params('TEM_e1_b', self.hidden_dim_1d, 3)) + self.te_conv1_act = paddle.nn.ReLU() + self.te_conv2 = paddle.nn.Conv1D( + in_channels=self.hidden_dim_1d, + out_channels=1, + kernel_size=1, + padding=0, + groups=1, + weight_attr=init_params('TEM_e2_w', self.hidden_dim_1d, 1), + bias_attr=init_params('TEM_e2_b', self.hidden_dim_1d, 1)) + self.te_conv2_act = paddle.nn.Sigmoid() + + #Proposal Evaluation Module + self.p_conv1 = paddle.nn.Conv1D( + in_channels=self.hidden_dim_1d, + out_channels=self.hidden_dim_2d, + kernel_size=3, + padding=1, + groups=1, + weight_attr=init_params('PEM_1d_w', self.hidden_dim_1d, 3), + bias_attr=init_params('PEM_1d_b', self.hidden_dim_1d, 3)) + self.p_conv1_act = paddle.nn.ReLU() + + # init to speed up + sample_mask = get_interp1d_mask(self.tscale, self.dscale, + self.prop_boundary_ratio, + self.num_sample, self.num_sample_perbin) + self.sample_mask = paddle.to_tensor(sample_mask) + self.sample_mask.stop_gradient = True + + self.p_conv3d1 = paddle.nn.Conv3D( + in_channels=128, + out_channels=self.hidden_dim_3d, + kernel_size=(self.num_sample, 1, 1), + stride=(self.num_sample, 1, 1), + padding=0, + weight_attr=ParamAttr(name="PEM_3d1_w"), + bias_attr=ParamAttr(name="PEM_3d1_b")) + self.p_conv3d1_act = paddle.nn.ReLU() + + self.p_conv2d1 = paddle.nn.Conv2D( + in_channels=512, + out_channels=self.hidden_dim_2d, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(name="PEM_2d1_w"), + bias_attr=ParamAttr(name="PEM_2d1_b")) + self.p_conv2d1_act = paddle.nn.ReLU() + + self.p_conv2d2 = paddle.nn.Conv2D( + in_channels=128, + out_channels=self.hidden_dim_2d, 
+ kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(name="PEM_2d2_w"), + bias_attr=ParamAttr(name="PEM_2d2_b")) + self.p_conv2d2_act = paddle.nn.ReLU() + + self.p_conv2d3 = paddle.nn.Conv2D( + in_channels=128, + out_channels=self.hidden_dim_2d, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(name="PEM_2d3_w"), + bias_attr=ParamAttr(name="PEM_2d3_b")) + self.p_conv2d3_act = paddle.nn.ReLU() + + self.p_conv2d4 = paddle.nn.Conv2D( + in_channels=128, + out_channels=2, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(name="PEM_2d4_w"), + bias_attr=ParamAttr(name="PEM_2d4_b")) + self.p_conv2d4_act = paddle.nn.Sigmoid() + + def init_weights(self): + pass + + def forward(self, x): + #Base Module + x = self.b_conv1(x) + x = self.b_conv1_act(x) + x = self.b_conv2(x) + x = self.b_conv2_act(x) + + #TEM + xs = self.ts_conv1(x) + xs = self.ts_conv1_act(xs) + xs = self.ts_conv2(xs) + xs = self.ts_conv2_act(xs) + xs = paddle.squeeze(xs, axis=[1]) + xe = self.te_conv1(x) + xe = self.te_conv1_act(xe) + xe = self.te_conv2(xe) + xe = self.te_conv2_act(xe) + xe = paddle.squeeze(xe, axis=[1]) + + #PEM + xp = self.p_conv1(x) + xp = self.p_conv1_act(xp) + #BM layer + xp = paddle.matmul(xp, self.sample_mask) + xp = paddle.reshape(xp, shape=[0, 0, -1, self.dscale, self.tscale]) + + xp = self.p_conv3d1(xp) + xp = self.p_conv3d1_act(xp) + xp = paddle.squeeze(xp, axis=[2]) + xp = self.p_conv2d1(xp) + xp = self.p_conv2d1_act(xp) + xp = self.p_conv2d2(xp) + xp = self.p_conv2d2_act(xp) + xp = self.p_conv2d3(xp) + xp = self.p_conv2d3_act(xp) + xp = self.p_conv2d4(xp) + xp = self.p_conv2d4_act(xp) + return xp, xs, xe diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/cfbi.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/cfbi.py new file mode 100644 index 0000000..5fbf044 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/cfbi.py @@ -0,0 +1,88 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
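The BM layer in the BMN forward pass above is the one non-obvious tensor manipulation in that file: the 1-D PEM feature map is multiplied by a precomputed sampling mask and reshaped into a boundary-matching feature cube before the 3-D and 2-D convolutions. The snippet below is a shape-only sketch of that step; the random tensor stands in for the real get_interp1d_mask output, and the tscale/dscale/num_sample values are illustrative rather than taken from a real config.

import paddle

# Shape-only sketch of the BM layer (illustrative sizes; a random tensor
# stands in for the mask built by get_interp1d_mask).
N, C, tscale, dscale, num_sample = 2, 128, 20, 20, 8
xp = paddle.randn([N, C, tscale])                            # output of p_conv1
sample_mask = paddle.randn([tscale, num_sample * dscale * tscale])
xp = paddle.matmul(xp, sample_mask)                          # [N, C, S*D*T]
xp = paddle.reshape(xp, shape=[0, 0, -1, dscale, tscale])    # [N, C, S, D, T]
print(xp.shape)                                              # [2, 128, 8, 20, 20]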
+ +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ..registry import BACKBONES +from .deeplab import DeepLab + + +class FPN(nn.Layer): + """FPN Layer""" + def __init__(self, in_dim_4x, in_dim_8x, in_dim_16x, out_dim): + super(FPN, self).__init__() + self.toplayer = self._make_layer(in_dim_16x, out_dim) + self.latlayer1 = self._make_layer(in_dim_8x, out_dim) + self.latlayer2 = self._make_layer(in_dim_4x, out_dim) + + self.smooth1 = self._make_layer(out_dim, + out_dim, + kernel_size=3, + padding=1) + self.smooth2 = self._make_layer(out_dim, + out_dim, + kernel_size=3, + padding=1) + + def _make_layer(self, in_dim, out_dim, kernel_size=1, padding=0): + return nn.Sequential( + nn.Conv2D(in_dim, + out_dim, + kernel_size=kernel_size, + stride=1, + padding=padding, + bias_attr=False), + nn.GroupNorm(num_groups=32, num_channels=out_dim)) + + def forward(self, x_4x, x_8x, x_16x): + """ forward function""" + x_16x = self.toplayer(x_16x) + x_8x = self.latlayer1(x_8x) + x_4x = self.latlayer2(x_4x) + + x_8x = x_8x + F.interpolate( + x_16x, size=x_8x.shape[-2:], mode='bilinear', align_corners=True) + x_4x = x_4x + F.interpolate( + x_8x, size=x_4x.shape[-2:], mode='bilinear', align_corners=True) + + x_8x = self.smooth1(x_8x) + x_4x = self.smooth2(x_4x) + + return F.relu(x_4x), F.relu(x_8x), F.relu(x_16x) + + +@BACKBONES.register() +class CFBI(nn.Layer): + """CFBI plus backbone""" + def __init__(self, + backbone='resnet', + freeze_bn=True, + model_aspp_outdim=256, + in_dim_8x=512, + model_semantic_embedding_dim=256): #,epsilon=1e-05): + super(CFBI, self).__init__() + #self.epsilon = epsilon + self.feature_extracter = DeepLab(backbone=backbone, freeze_bn=freeze_bn) + self.fpn = FPN(in_dim_4x=model_aspp_outdim, + in_dim_8x=in_dim_8x, + in_dim_16x=model_aspp_outdim, + out_dim=model_semantic_embedding_dim) + + def forward(self, x): + """forward function""" + x, aspp_x, low_level, mid_level = self.feature_extracter(x, True) + x_4x, x_8x, x_16x = self.fpn(x, mid_level, aspp_x) + return x_4x, x_8x, x_16x, low_level diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/ctrgcn.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/ctrgcn.py new file mode 100644 index 0000000..9d645f4 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/ctrgcn.py @@ -0,0 +1,514 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
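As a quick sanity check of the FPN defined in cfbi.py above, the sketch below feeds random feature maps at strides 4/8/16 through it. The import path is an assumption based on this diff's package layout, and the channel sizes mirror the CFBI defaults (ASPP out_dim 256, in_dim_8x 512); the spatial sizes are illustrative.

import paddle
from paddlevideo.modeling.backbones.cfbi import FPN  # path assumed from this diff

# Feature maps at strides 4/8/16 of a 384x384 frame (sizes illustrative).
x_4x = paddle.randn([1, 256, 96, 96])
x_8x = paddle.randn([1, 512, 48, 48])
x_16x = paddle.randn([1, 256, 24, 24])

fpn = FPN(in_dim_4x=256, in_dim_8x=512, in_dim_16x=256, out_dim=256)
o_4x, o_8x, o_16x = fpn(x_4x, x_8x, x_16x)
# Top-down merging keeps each spatial size and maps every level to out_dim.
print(o_4x.shape, o_8x.shape, o_16x.shape)  # [1,256,96,96] [1,256,48,48] [1,256,24,24]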
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +from ..registry import BACKBONES +from ..weight_init import weight_init_ + + +def conv_init(conv): + if conv.weight is not None: + weight_init_(conv.weight, 'kaiming_normal_', mode='fan_in') + if conv.bias is not None: + nn.initializer.Constant(value=0.0)(conv.bias) + + +def bn_init(bn, scale): + nn.initializer.Constant(value=float(scale))(bn.weight) + nn.initializer.Constant(value=0.0)(bn.bias) + + +def einsum(x1, x3): + """paddle.einsum only support in dynamic graph mode. + x1 : n c u v + x2 : n c t v + """ + n, c, u, v1 = x1.shape + n, c, t, v3 = x3.shape + assert (v1 == v3), "Args of einsum not match!" + x1 = paddle.transpose(x1, perm=[0, 1, 3, 2]) # n c v u + y = paddle.matmul(x3, x1) + # out: n c t u + return y + + +class CTRGC(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + rel_reduction=8, + mid_reduction=1): + super(CTRGC, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + if in_channels == 3 or in_channels == 9: + self.rel_channels = 8 + self.mid_channels = 16 + else: + self.rel_channels = in_channels // rel_reduction + self.mid_channels = in_channels // mid_reduction + self.conv1 = nn.Conv2D(self.in_channels, + self.rel_channels, + kernel_size=1) + self.conv2 = nn.Conv2D(self.in_channels, + self.rel_channels, + kernel_size=1) + self.conv3 = nn.Conv2D(self.in_channels, + self.out_channels, + kernel_size=1) + self.conv4 = nn.Conv2D(self.rel_channels, + self.out_channels, + kernel_size=1) + self.tanh = nn.Tanh() + + def init_weights(self): + """Initiate the parameters. + """ + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + conv_init(m) + elif isinstance(m, nn.BatchNorm2D): + bn_init(m, 1) + + def forward(self, x, A=None, alpha=1): + x1, x2, x3 = self.conv1(x).mean(-2), self.conv2(x).mean(-2), self.conv3( + x) + x1 = self.tanh(x1.unsqueeze(-1) - x2.unsqueeze(-2)) + x1 = self.conv4(x1) * alpha + ( + A.unsqueeze(0).unsqueeze(0) if A is not None else 0) # N,C,V,V + # We only support 'paddle.einsum()' in dynamic graph mode, if use in infer model please implement self. 
+ # x1 = paddle.einsum('ncuv,nctv->nctu', x1, x3) + x1 = einsum(x1, x3) + return x1 + + +class TemporalConv(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + dilation=1): + super(TemporalConv, self).__init__() + pad = (kernel_size + (kernel_size - 1) * (dilation - 1) - 1) // 2 + self.conv = nn.Conv2D(in_channels, + out_channels, + kernel_size=(kernel_size, 1), + padding=(pad, 0), + stride=(stride, 1), + dilation=(dilation, 1)) + + self.bn = nn.BatchNorm2D(out_channels) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class MultiScale_TemporalConv(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilations=[1, 2, 3, 4], + residual=True, + residual_kernel_size=1): + + super(MultiScale_TemporalConv, self).__init__() + assert out_channels % ( + len(dilations) + + 2) == 0, '# out channels should be multiples of # branches' + + # Multiple branches of temporal convolution + self.num_branches = len(dilations) + 2 + branch_channels = out_channels // self.num_branches + if type(kernel_size) == list: + assert len(kernel_size) == len(dilations) + else: + kernel_size = [kernel_size] * len(dilations) + # Temporal Convolution branches + self.branches = nn.LayerList([ + nn.Sequential( + nn.Conv2D(in_channels, + branch_channels, + kernel_size=1, + padding=0), + nn.BatchNorm2D(branch_channels), + nn.ReLU(), + TemporalConv(branch_channels, + branch_channels, + kernel_size=ks, + stride=stride, + dilation=dilation), + ) for ks, dilation in zip(kernel_size, dilations) + ]) + + # Additional Max & 1x1 branch + self.branches.append( + nn.Sequential( + nn.Conv2D(in_channels, + branch_channels, + kernel_size=1, + padding=0), nn.BatchNorm2D(branch_channels), + nn.ReLU(), + nn.MaxPool2D(kernel_size=(3, 1), + stride=(stride, 1), + padding=(1, 0)), nn.BatchNorm2D(branch_channels))) + + self.branches.append( + nn.Sequential( + nn.Conv2D(in_channels, + branch_channels, + kernel_size=1, + padding=0, + stride=(stride, 1)), nn.BatchNorm2D(branch_channels))) + + # Residual connection + if not residual: + self.residual = lambda x: 0 + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + else: + self.residual = TemporalConv(in_channels, + out_channels, + kernel_size=residual_kernel_size, + stride=stride) + + def init_weights(self): + """Initiate the parameters. 
+ """ + # initialize + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + conv_init(m) + elif isinstance(m, nn.BatchNorm2D): + weight_init_(m.weight, 'Normal', std=0.02, mean=1.0) + nn.initializer.Constant(value=0.0)(m.bias) + + def forward(self, x): + # Input dim: (N,C,T,V) + res = self.residual(x) + branch_outs = [] + for tempconv in self.branches: + out = tempconv(x) + branch_outs.append(out) + + out = paddle.concat(branch_outs, axis=1) + out += res + return out + + +class unit_tcn(nn.Layer): + + def __init__(self, in_channels, out_channels, kernel_size=9, stride=1): + super(unit_tcn, self).__init__() + pad = int((kernel_size - 1) / 2) + self.conv = nn.Conv2D(in_channels, + out_channels, + kernel_size=(kernel_size, 1), + padding=(pad, 0), + stride=(stride, 1)) + + self.bn = nn.BatchNorm2D(out_channels) + self.relu = nn.ReLU() + conv_init(self.conv) + bn_init(self.bn, 1) + + def forward(self, x): + x = self.bn(self.conv(x)) + return x + + +class unit_gcn(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + A, + coff_embedding=4, + adaptive=True, + residual=True): + super(unit_gcn, self).__init__() + inter_channels = out_channels // coff_embedding + self.inter_c = inter_channels + self.out_c = out_channels + self.in_c = in_channels + self.adaptive = adaptive + self.num_subset = A.shape[0] + self.convs = nn.LayerList() + + for i in range(self.num_subset): + self.convs.append(CTRGC(in_channels, out_channels)) + + if residual: + if in_channels != out_channels: + self.down = nn.Sequential( + nn.Conv2D(in_channels, out_channels, 1), + nn.BatchNorm2D(out_channels)) + else: + self.down = lambda x: x + else: + self.down = lambda x: 0 + if self.adaptive: + pa_param = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(A.astype(np.float32))) + self.PA = paddle.create_parameter(shape=A.shape, + dtype='float32', + attr=pa_param) + else: + A_tensor = paddle.to_tensor(A, dtype="float32") + self.A = paddle.create_parameter( + shape=A_tensor.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Assign(A_tensor)) + self.A.stop_gradient = True + alpha_tensor = paddle.to_tensor(np.zeros(1), dtype="float32") + self.alpha = paddle.create_parameter( + shape=alpha_tensor.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Assign(alpha_tensor)) + self.bn = nn.BatchNorm2D(out_channels) + self.soft = nn.Softmax(-2) + self.relu = nn.ReLU() + + def init_weights(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + conv_init(m) + elif isinstance(m, nn.BatchNorm2D): + bn_init(m, 1) + bn_init(self.bn, 1e-6) + + def forward(self, x): + y = None + if self.adaptive: + A = self.PA + else: + A = self.A.cuda(x.get_device()) + for i in range(self.num_subset): + z = self.convs[i](x, A[i], self.alpha) + y = z + y if y is not None else z + y = self.bn(y) + y += self.down(x) + y = self.relu(y) + return y + + +class TCN_GCN_unit(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + A, + stride=1, + residual=True, + adaptive=True, + kernel_size=5, + dilations=[1, 2]): + super(TCN_GCN_unit, self).__init__() + self.gcn1 = unit_gcn(in_channels, out_channels, A, adaptive=adaptive) + self.tcn1 = MultiScale_TemporalConv(out_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + dilations=dilations, + residual=False) + self.relu = nn.ReLU() + if not residual: + self.residual = lambda x: 0 + + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + + else: + self.residual = unit_tcn(in_channels, + 
out_channels, + kernel_size=1, + stride=stride) + + def forward(self, x): + y = self.relu(self.tcn1(self.gcn1(x)) + self.residual(x)) + return y + + +class NTUDGraph: + + def __init__(self, labeling_mode='spatial'): + num_node = 25 + self_link = [(i, i) for i in range(num_node)] + inward_ori_index = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5), + (7, 6), (8, 7), (9, 21), (10, 9), (11, 10), + (12, 11), (13, 1), (14, 13), (15, 14), (16, 15), + (17, 1), (18, 17), (19, 18), (20, 19), (22, 23), + (23, 8), (24, 25), (25, 12)] + inward = [(i - 1, j - 1) for (i, j) in inward_ori_index] + outward = [(j, i) for (i, j) in inward] + neighbor = inward + outward + + self.num_node = num_node + self.self_link = self_link + self.inward = inward + self.outward = outward + self.neighbor = neighbor + self.A = self.get_adjacency_matrix(labeling_mode) + + def edge2mat(self, link, num_node): + A = np.zeros((num_node, num_node)) + for i, j in link: + A[j, i] = 1 + return A + + def normalize_digraph(self, A): + Dl = np.sum(A, 0) + h, w = A.shape + Dn = np.zeros((w, w)) + for i in range(w): + if Dl[i] > 0: + Dn[i, i] = Dl[i]**(-1) + AD = np.dot(A, Dn) + return AD + + def get_spatial_graph(self, num_node, self_link, inward, outward): + I = self.edge2mat(self_link, num_node) + In = self.normalize_digraph(self.edge2mat(inward, num_node)) + Out = self.normalize_digraph(self.edge2mat(outward, num_node)) + A = np.stack((I, In, Out)) + return A + + def get_adjacency_matrix(self, labeling_mode=None): + if labeling_mode is None: + return self.A + if labeling_mode == 'spatial': + A = self.get_spatial_graph(self.num_node, self.self_link, + self.inward, self.outward) + else: + raise ValueError() + return A + + +@BACKBONES.register() +class CTRGCN(nn.Layer): + """ + CTR-GCN model from: + `"Channel-wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition" `_ + Args: + num_point: int, numbers of sketeton point. + num_person: int, numbers of person. + base_channel: int, model's hidden dim. + graph: str, sketeton adjacency matrix name. + graph_args: dict, sketeton adjacency graph class args. + in_channels: int, channels of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default 3. + adaptive: bool, if adjacency matrix can adaptive. 
+ """ + + def __init__(self, + num_point=25, + num_person=2, + base_channel=64, + graph='ntu_rgb_d', + graph_args=dict(), + in_channels=3, + adaptive=True): + super(CTRGCN, self).__init__() + + if graph == 'ntu_rgb_d': + self.graph = NTUDGraph(**graph_args) + else: + raise ValueError() + + A = self.graph.A # 3,25,25 + + self.num_point = num_point + self.data_bn = nn.BatchNorm1D(num_person * in_channels * num_point) + self.base_channel = base_channel + + self.l1 = TCN_GCN_unit(in_channels, + self.base_channel, + A, + residual=False, + adaptive=adaptive) + self.l2 = TCN_GCN_unit(self.base_channel, + self.base_channel, + A, + adaptive=adaptive) + self.l3 = TCN_GCN_unit(self.base_channel, + self.base_channel, + A, + adaptive=adaptive) + self.l4 = TCN_GCN_unit(self.base_channel, + self.base_channel, + A, + adaptive=adaptive) + self.l5 = TCN_GCN_unit(self.base_channel, + self.base_channel * 2, + A, + stride=2, + adaptive=adaptive) + self.l6 = TCN_GCN_unit(self.base_channel * 2, + self.base_channel * 2, + A, + adaptive=adaptive) + self.l7 = TCN_GCN_unit(self.base_channel * 2, + self.base_channel * 2, + A, + adaptive=adaptive) + self.l8 = TCN_GCN_unit(self.base_channel * 2, + self.base_channel * 4, + A, + stride=2, + adaptive=adaptive) + self.l9 = TCN_GCN_unit(self.base_channel * 4, + self.base_channel * 4, + A, + adaptive=adaptive) + self.l10 = TCN_GCN_unit(self.base_channel * 4, + self.base_channel * 4, + A, + adaptive=adaptive) + + def init_weights(self): + bn_init(self.data_bn, 1) + + def forward(self, x): + N, C, T, V, M = x.shape + x = paddle.transpose(x, perm=[0, 4, 3, 1, 2]) + x = paddle.reshape(x, (N, M * V * C, T)) + + x = self.data_bn(x) + + x = paddle.reshape(x, (N, M, V, C, T)) + x = paddle.transpose(x, perm=(0, 1, 3, 4, 2)) + + x = paddle.reshape(x, (N * M, C, T, V)) + + x = self.l1(x) + x = self.l2(x) + x = self.l3(x) + x = self.l4(x) + x = self.l5(x) + x = self.l6(x) + x = self.l7(x) + x = self.l8(x) + x = self.l9(x) + x = self.l10(x) + + return x, N, M diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/darknet.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/darknet.py new file mode 100644 index 0000000..3f48bf6 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/darknet.py @@ -0,0 +1,165 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + + +class ConvBNLayer(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter_size, + stride, + padding, + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = nn.Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=filter_size, + stride=stride, + padding=padding, + weight_attr=ParamAttr(name=name + ".conv.weights"), + bias_attr=False) + + bn_name = name + ".bn" + self._bn = nn.BatchNorm( + num_channels=output_channels, + act="leaky_relu", + param_attr=ParamAttr(name=bn_name + ".scale"), + bias_attr=ParamAttr(name=bn_name + ".offset"), + moving_mean_name=bn_name + ".mean", + moving_variance_name=bn_name + ".var") + + def forward(self, inputs): + x = self._conv(inputs) + x = self._bn(x) + return x + + +class BasicBlock(nn.Layer): + def __init__(self, input_channels, output_channels, name=None): + super(BasicBlock, self).__init__() + + self._conv1 = ConvBNLayer(input_channels=input_channels, output_channels=output_channels, filter_size=[ + 3, 3], stride=1, padding=1, name=name+'.0') + self._max_pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self._conv2 = ConvBNLayer(input_channels=output_channels, output_channels=output_channels * + 2, filter_size=[3, 3], stride=1, padding=1, name=name+'.1') + self._conv3 = ConvBNLayer(input_channels=output_channels*2, output_channels=output_channels, + filter_size=[1, 1], stride=1, padding=0, name=name+'.2') + + def forward(self, x): + x = self._conv1(x) + x = self._max_pool(x) + x = self._conv2(x) + x = self._conv3(x) + return x + + +class Reorg(nn.Layer): + def __init__(self, stride=2): + super(Reorg, self).__init__() + self.stride = stride + + def forward(self, x): + stride = self.stride + assert (x.dim() == 4) + B = x.shape[0] + C = x.shape[1] + H = x.shape[2] + W = x.shape[3] + assert (H % stride == 0) + assert (W % stride == 0) + ws = stride + hs = stride + x = x.reshape([B, C, H // hs, hs, W // ws, ws] + ).transpose([0, 1, 2, 4, 3, 5]) + x = x.reshape([B, C, H // hs * W // ws, hs * ws] + ).transpose([0, 1, 3, 2]) + x = x.reshape([B, C, hs * ws, H // hs, W // ws] + ).transpose([0, 2, 1, 3, 4]) + x = x.reshape([B, hs * ws * C, H // hs, W // ws]) + return x + + +class Darknet(nn.Layer): + def __init__(self, pretrained=None): + super(Darknet, self).__init__() + self.pretrained = pretrained + self._conv1 = ConvBNLayer( + input_channels=3, output_channels=32, filter_size=3, stride=1, padding=1, name='input') + self._max_pool1 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self._basic_block_11 = BasicBlock( + input_channels=32, output_channels=64, name='1.1') + self._basic_block_12 = BasicBlock( + input_channels=64, output_channels=128, name='1.2') + self._basic_block_13 = BasicBlock( + input_channels=128, output_channels=256, name='1.3') + self._conv2 = ConvBNLayer( + input_channels=256, output_channels=512, filter_size=3, stride=1, padding=1, name='up1') + self._conv3 = ConvBNLayer( + input_channels=512, output_channels=256, filter_size=1, stride=1, padding=0, name='down1') + self._conv4 = ConvBNLayer( + input_channels=256, output_channels=512, filter_size=3, stride=1, padding=1, name='2.1') + self._max_pool2 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self._conv5 = ConvBNLayer( + input_channels=512, output_channels=1024, filter_size=3, stride=1, padding=1, name='2.2') + self._conv6 = ConvBNLayer(input_channels=1024, output_channels=512, + 
filter_size=1, stride=1, padding=0, name='2.3') # ori + self._conv7 = ConvBNLayer( + input_channels=512, output_channels=1024, filter_size=3, stride=1, padding=1, name='up2') + self._conv8 = ConvBNLayer(input_channels=1024, output_channels=512, + filter_size=1, stride=1, padding=0, name='down2') + self._conv9 = ConvBNLayer( + input_channels=512, output_channels=1024, filter_size=3, stride=1, padding=1, name='3.1') + self._conv10 = ConvBNLayer( + input_channels=1024, output_channels=1024, filter_size=3, stride=1, padding=1, name='3.2') + self._conv11 = ConvBNLayer( + input_channels=1024, output_channels=1024, filter_size=3, stride=1, padding=1, name='3.3') + self._conv12 = ConvBNLayer( + input_channels=512, output_channels=64, filter_size=1, stride=1, padding=0, name='4.1') + self._reorg = Reorg() + self._conv13 = ConvBNLayer( + input_channels=1280, output_channels=1024, filter_size=3, stride=1, padding=1, name='5.1') + self._conv14 = nn.Conv2D(1024, 425, kernel_size=1) + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._max_pool1(x) + x = self._basic_block_11(x) + x = self._basic_block_12(x) + x = self._basic_block_13(x) + x = self._conv2(x) + x = self._conv3(x) + ori = self._conv4(x) + x = self._max_pool2(ori) + x = self._conv5(x) + x = self._conv6(x) + x = self._conv7(x) + x = self._conv8(x) + x = self._conv9(x) + x = self._conv10(x) + x1 = self._conv11(x) + x2 = self._conv12(ori) + x2 = self._reorg(x2) + x = paddle.concat([x2, x1], 1) + x = self._conv13(x) + x = self._conv14(x) + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/deeplab.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/deeplab.py new file mode 100644 index 0000000..c566205 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/deeplab.py @@ -0,0 +1,454 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
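The Reorg layer in darknet.py above is the YOLOv2-style passthrough: it folds each stride-by-stride spatial block into the channel axis so a higher-resolution feature map can be concatenated with a deeper one, which is why conv13 takes 1280 channels (1024 + 64 * 2 * 2). A small hedged check, assuming the module is importable from this diff's layout:

import paddle
from paddlevideo.modeling.backbones.darknet import Reorg  # path assumed from this diff

x = paddle.randn([1, 64, 26, 26])
y = Reorg(stride=2)(x)
# Space-to-depth: [B, C, H, W] -> [B, C*s*s, H/s, W/s]
print(y.shape)   # [1, 256, 13, 13]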
+ +import numpy as np +import copy + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ..registry import BACKBONES + + +class FrozenBatchNorm2D(nn.Layer): + """ + BatchNorm2D where the batch statistics and the affine parameters + are fixed + """ + def __init__(self, n, epsilon=1e-5): + super(FrozenBatchNorm2D, self).__init__() + x1 = paddle.ones([n]) + x2 = paddle.zeros([n]) + weight = self.create_parameter( + shape=x1.shape, default_initializer=nn.initializer.Assign(x1)) + bias = self.create_parameter( + shape=x2.shape, default_initializer=nn.initializer.Assign(x2)) + running_mean = self.create_parameter( + shape=x2.shape, default_initializer=nn.initializer.Assign(x2)) + running_var = self.create_parameter( + shape=x1.shape, default_initializer=nn.initializer.Assign(x1)) + self.add_parameter('weight', weight) + self.add_parameter('bias', bias) + self.add_parameter('running_mean', running_mean) + self.add_parameter('running_var', running_var) + self.epsilon = epsilon + + def forward(self, x): + scale = self.weight * paddle.rsqrt((self.running_var + self.epsilon)) + bias = self.bias - self.running_mean * scale + scale = paddle.reshape(scale, [1, -1, 1, 1]) + bias = paddle.reshape(bias, [1, -1, 1, 1]) + return x * scale + bias + + +class Bottleneck(nn.Layer): + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + BatchNorm=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False) + self.bn1 = BatchNorm(planes) + self.conv2 = nn.Conv2D(planes, + planes, + kernel_size=3, + stride=stride, + dilation=dilation, + padding=dilation, + bias_attr=False) + self.bn2 = BatchNorm(planes) + self.conv3 = nn.Conv2D(planes, + planes * 4, + kernel_size=1, + bias_attr=False) + self.bn3 = BatchNorm(planes * 4) + self.relu = nn.ReLU() + self.downsample = downsample + self.stride = stride + self.dilation = dilation + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet(nn.Layer): + def __init__(self, + block, + layers, + output_stride, + BatchNorm, + pretrained=False): + self.inplanes = 64 + super(ResNet, self).__init__() + blocks = [1, 2, 4] + if output_stride == 16: + strides = [1, 2, 2, 1] + dilations = [1, 1, 1, 2] + elif output_stride == 8: + strides = [1, 2, 1, 1] + dilations = [1, 1, 2, 4] + else: + raise NotImplementedError + + # Modules + self.conv1 = nn.Conv2D(3, + 64, + kernel_size=7, + stride=2, + padding=3, + bias_attr=False) + self.bn1 = BatchNorm(64) + self.relu = nn.ReLU() + self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.layer1 = self._make_layer(block, + 64, + layers[0], + stride=strides[0], + dilation=dilations[0], + BatchNorm=BatchNorm) + self.layer2 = self._make_layer(block, + 128, + layers[1], + stride=strides[1], + dilation=dilations[1], + BatchNorm=BatchNorm) + self.layer3 = self._make_layer(block, + 256, + layers[2], + stride=strides[2], + dilation=dilations[2], + BatchNorm=BatchNorm) + self.layer4 = self._make_MG_unit(block, + 512, + blocks=blocks, + stride=strides[3], + dilation=dilations[3], + BatchNorm=BatchNorm) + self._init_weight() + + def _make_layer(self, + block, + planes, + blocks, + stride=1, + 
dilation=1, + BatchNorm=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D(self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias_attr=False), + BatchNorm(planes * block.expansion), + ) + + layers = [] + layers.append( + block(self.inplanes, planes, stride, dilation, downsample, + BatchNorm)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block(self.inplanes, + planes, + dilation=dilation, + BatchNorm=BatchNorm)) + + return nn.Sequential(*layers) + + def _make_MG_unit(self, + block, + planes, + blocks, + stride=1, + dilation=1, + BatchNorm=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D(self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias_attr=False), + BatchNorm(planes * block.expansion), + ) + + layers = [] + layers.append( + block(self.inplanes, + planes, + stride, + dilation=blocks[0] * dilation, + downsample=downsample, + BatchNorm=BatchNorm)) + self.inplanes = planes * block.expansion + for i in range(1, len(blocks)): + layers.append( + block(self.inplanes, + planes, + stride=1, + dilation=blocks[i] * dilation, + BatchNorm=BatchNorm)) + + return nn.Sequential(*layers) + + def forward(self, input, return_mid_level=False): + x = self.conv1(input) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + low_level_feat = x + x = self.layer2(x) + mid_level_feat = x + x = self.layer3(x) + x = self.layer4(x) + if return_mid_level: + return x, low_level_feat, mid_level_feat + else: + return x, low_level_feat + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + nn.initializer.KaimingNormal() + elif isinstance(m, nn.GroupNorm): + m.weight.data = nn.initializer.Constant(1) + m.bias.data = nn.initializer.Constant(0) + + +class _ASPPModule(nn.Layer): + def __init__(self, inplanes, planes, kernel_size, padding, dilation, + BatchNorm): + super(_ASPPModule, self).__init__() + self.atrous_conv = nn.Conv2D(inplanes, + planes, + kernel_size=kernel_size, + stride=1, + padding=padding, + dilation=dilation, + bias_attr=False) + self.bn = BatchNorm(planes) + self.relu = nn.ReLU() + + self._init_weight() + + def forward(self, x): + x = self.atrous_conv(x) + x = self.bn(x) + + return self.relu(x) + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + m.weight_attr = nn.initializer.KaimingNormal() + elif isinstance(m, nn.BatchNorm2D): + m.weight.data.fill_(1) + m.bias.data.zero_() + + +class ASPP(nn.Layer): + def __init__(self, backbone, output_stride, BatchNorm): + super(ASPP, self).__init__() + if backbone == 'drn': + inplanes = 512 + elif backbone == 'mobilenet': + inplanes = 320 + else: + inplanes = 2048 + if output_stride == 16: + dilations = [1, 6, 12, 18] + elif output_stride == 8: + dilations = [1, 12, 24, 36] + else: + raise NotImplementedError + + self.aspp1 = _ASPPModule(inplanes, + 256, + 1, + padding=0, + dilation=dilations[0], + BatchNorm=BatchNorm) + self.aspp2 = _ASPPModule(inplanes, + 256, + 3, + padding=dilations[1], + dilation=dilations[1], + BatchNorm=BatchNorm) + self.aspp3 = _ASPPModule(inplanes, + 256, + 3, + padding=dilations[2], + dilation=dilations[2], + BatchNorm=BatchNorm) + self.aspp4 = _ASPPModule(inplanes, + 256, + 3, + padding=dilations[3], + dilation=dilations[3], + BatchNorm=BatchNorm) + + self.global_avg_pool = 
nn.Sequential( + nn.AdaptiveAvgPool2D((1, 1)), + nn.Conv2D(inplanes, 256, 1, stride=1, bias_attr=False), + BatchNorm(256), nn.ReLU()) + self.conv1 = nn.Conv2D(1280, 256, 1, bias_attr=False) + self.bn1 = BatchNorm(256) + self.relu = nn.ReLU() + self.dropout = nn.Dropout(0.1) + self._init_weight() + + def forward(self, x): + x1 = self.aspp1(x) + x2 = self.aspp2(x) + x3 = self.aspp3(x) + x4 = self.aspp4(x) + x5 = self.global_avg_pool(x) + x5 = F.interpolate(x5, + size=x4.shape[2:], + mode='bilinear', + align_corners=True) + x = paddle.concat(x=[x1, x2, x3, x4, x5], axis=1) + + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + return self.dropout(x) + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + nn.initializer.KaimingNormal() + elif isinstance(m, nn.GroupNorm): + m.weight.data = nn.initializer.Constant(1) + m.bias.data = nn.initializer.Constant(0) + + +class Decoder(nn.Layer): + def __init__(self, backbone, BatchNorm): + super(Decoder, self).__init__() + if backbone == 'resnet': + low_level_inplanes = 256 + elif backbone == 'mobilenet': + raise NotImplementedError + else: + raise NotImplementedError + + self.conv1 = nn.Conv2D(low_level_inplanes, 48, 1, bias_attr=False) + self.bn1 = BatchNorm(48) + self.relu = nn.ReLU() + + self.last_conv = nn.Sequential( + nn.Conv2D(304, + 256, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False), BatchNorm(256), nn.ReLU(), + nn.Sequential(), + nn.Conv2D(256, + 256, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False), BatchNorm(256), nn.ReLU(), + nn.Sequential()) + + self._init_weight() + + def forward(self, x, low_level_feat): + low_level_feat = self.conv1(low_level_feat) + low_level_feat = self.bn1(low_level_feat) + low_level_feat = self.relu(low_level_feat) + + x = F.interpolate(x, + size=low_level_feat.shape[2:], + mode='bilinear', + align_corners=True) + x = paddle.concat(x=[x, low_level_feat], axis=1) + x = self.last_conv(x) + + return x + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + nn.initializer.KaimingNormal() + elif isinstance(m, nn.GroupNorm): + m.weight.data = nn.initializer.Constant(1) + m.bias.data = nn.initializer.Constant(0) + + +class DeepLab(nn.Layer): + """DeepLab model for segmentation""" + def __init__(self, backbone='resnet', output_stride=16, freeze_bn=True): + super(DeepLab, self).__init__() + + if freeze_bn == True: + print("Use frozen BN in DeepLab!") + BatchNorm = FrozenBatchNorm2D + else: + BatchNorm = nn.BatchNorm2D + + self.backbone = ResNet(Bottleneck, [3, 4, 23, 3], + output_stride, + BatchNorm, + pretrained=True) + self.aspp = ASPP(backbone, output_stride, BatchNorm) + self.decoder = Decoder(backbone, BatchNorm) + + def forward(self, input, return_aspp=False): + """forward function""" + if return_aspp: + x, low_level_feat, mid_level_feat = self.backbone(input, True) + else: + x, low_level_feat = self.backbone(input) + aspp_x = self.aspp(x) + x = self.decoder(aspp_x, low_level_feat) + + if return_aspp: + return x, aspp_x, low_level_feat, mid_level_feat + else: + return x, low_level_feat diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/movinet.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/movinet.py new file mode 100644 index 0000000..cb6d4fd --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/movinet.py @@ -0,0 +1,574 @@ +import collections.abc +from itertools import repeat +from typing import Any, Callable, Optional, Tuple, Union + 
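Back in deeplab.py, FrozenBatchNorm2D replaces batch statistics with fixed buffers, which reduces the layer to a per-channel affine map y = (x - mean) / sqrt(var + eps) * weight + bias. The sketch below illustrates that equivalence, assuming the module is importable from this diff's layout; with the freshly initialized buffers (weight=1, bias=0, mean=0, var=1) the layer is effectively the identity.

import paddle
from paddlevideo.modeling.backbones.deeplab import FrozenBatchNorm2D  # path assumed

bn = FrozenBatchNorm2D(8)                 # weight=1, bias=0, mean=0, var=1 at init
x = paddle.randn([2, 8, 5, 5])
y = bn(x)
# The same affine map written out by hand; the printed difference should be ~0.
expected = x * (1.0 / (1.0 + 1e-5) ** 0.5)
print(float(paddle.abs(y - expected).max()))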
+import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.layer import Identity + +from ..registry import BACKBONES +from collections import OrderedDict + +container_abcs = collections.abc +"""Model Config +""" + +A0 = {'block_num': [0, 1, 3, 3, 4, 4]} +A0['conv1'] = [3, 8, (1, 3, 3), (1, 2, 2), (0, 1, 1)] +A0['b2_l0'] = [8, 8, 24, (1, 5, 5), (1, 2, 2), (0, 2, 2), (0, 1, 1)] +A0['b3_l0'] = [8, 32, 80, (3, 3, 3), (1, 2, 2), (1, 0, 0), (0, 0, 0)] +A0['b3_l1'] = [32, 32, 80, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b3_l2'] = [32, 32, 80, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b4_l0'] = [32, 56, 184, (5, 3, 3), (1, 2, 2), (2, 0, 0), (0, 0, 0)] +A0['b4_l1'] = [56, 56, 112, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b4_l2'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b5_l0'] = [56, 56, 184, (5, 3, 3), (1, 1, 1), (2, 1, 1), (0, 1, 1)] +A0['b5_l1'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b5_l2'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b5_l3'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b6_l0'] = [56, 104, 384, (5, 3, 3), (1, 2, 2), (2, 1, 1), (0, 1, 1)] +A0['b6_l1'] = [104, 104, 280, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)] +A0['b6_l2'] = [104, 104, 280, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)] +A0['b6_l3'] = [104, 104, 344, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)] +A0['conv7'] = [104, 480, (1, 1, 1), (1, 1, 1), (0, 0, 0)] + +MODEL_CONFIG = {'A0': A0} + + +def _ntuple(n): + def parse(x): + if isinstance(x, container_abcs.Iterable): + return x + return tuple(repeat(x, n)) + + return parse + + +def _make_divisible(v: float, + divisor: int, + min_value: Optional[int] = None) -> int: + """ + This function is taken from the original tf repo. + It ensures that all layers have a channel number that is divisible by 8. + It can be seen here: + https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py + """ + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
+ if new_v < 0.9 * v: + new_v += divisor + return new_v + + +_single = _ntuple(1) +_pair = _ntuple(2) +_triple = _ntuple(3) +_quadruple = _ntuple(4) + + +class CausalModule(nn.Layer): + def __init__(self) -> None: + super().__init__() + self.activation = None + + def reset_activation(self) -> None: + self.activation = None + + +class Conv2dBNActivation(nn.Sequential): + def __init__( + self, + in_planes: int, + out_planes: int, + kernel_size: Union[int, Tuple[int, int]], + padding: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + norm_layer: Optional[Callable[..., nn.Layer]] = None, + activation_layer: Optional[Callable[..., nn.Layer]] = None, + **kwargs: Any, + ) -> None: + kernel_size = _pair(kernel_size) + stride = _pair(stride) + padding = _pair(padding) + if norm_layer is None: + norm_layer = Identity + if activation_layer is None: + activation_layer = Identity + self.kernel_size = kernel_size + self.stride = stride + dict_layers = (nn.Conv2D(in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + **kwargs), norm_layer(out_planes, + momentum=0.1), + activation_layer()) + + self.out_channels = out_planes + super(Conv2dBNActivation, self).__init__(dict_layers[0], dict_layers[1], + dict_layers[2]) + + +class Conv3DBNActivation(nn.Sequential): + def __init__( + self, + in_planes: int, + out_planes: int, + kernel_size: Union[int, Tuple[int, int, int]], + padding: Union[int, Tuple[int, int, int]], + stride: Union[int, Tuple[int, int, int]] = 1, + groups: int = 1, + norm_layer: Optional[Callable[..., nn.Layer]] = None, + activation_layer: Optional[Callable[..., nn.Layer]] = None, + **kwargs: Any, + ) -> None: + kernel_size = _triple(kernel_size) + stride = _triple(stride) + padding = _triple(padding) + if norm_layer is None: + norm_layer = Identity + if activation_layer is None: + activation_layer = Identity + self.kernel_size = kernel_size + self.stride = stride + + dict_layers = (nn.Conv3D(in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + **kwargs), norm_layer(out_planes, + momentum=0.1), + activation_layer()) + self.out_channels = out_planes + super(Conv3DBNActivation, self).__init__(dict_layers[0], dict_layers[1], + dict_layers[2]) + + +class ConvBlock3D(CausalModule): + def __init__( + self, + in_planes: int, + out_planes: int, + kernel_size: Union[int, Tuple[int, int, int]], + causal: bool, + conv_type: str, + padding: Union[int, Tuple[int, int, int]] = 0, + stride: Union[int, Tuple[int, int, int]] = 1, + norm_layer: Optional[Callable[..., nn.Layer]] = None, + activation_layer: Optional[Callable[..., nn.Layer]] = None, + bias_attr: bool = False, + **kwargs: Any, + ) -> None: + super().__init__() + kernel_size = _triple(kernel_size) + stride = _triple(stride) + padding = _triple(padding) + self.conv_2 = None + + if causal is True: + padding = (0, padding[1], padding[2]) + if conv_type != "2plus1d" and conv_type != "3d": + raise ValueError("only 2plus2d or 3d are " + + "allowed as 3d convolutions") + + if conv_type == "2plus1d": + self.conv_1 = Conv2dBNActivation(in_planes, + out_planes, + kernel_size=(kernel_size[1], + kernel_size[2]), + padding=(padding[1], padding[2]), + stride=(stride[1], stride[2]), + activation_layer=activation_layer, + norm_layer=norm_layer, + bias_attr=bias_attr, + **kwargs) + if kernel_size[0] > 1: + self.conv_2 = Conv2dBNActivation( + in_planes, + out_planes, + kernel_size=(kernel_size[0], 1), + 
padding=(padding[0], 0), + stride=(stride[0], 1), + activation_layer=activation_layer, + norm_layer=norm_layer, + bias_attr=bias_attr, + **kwargs) + elif conv_type == "3d": + self.conv_1 = Conv3DBNActivation(in_planes, + out_planes, + kernel_size=kernel_size, + padding=padding, + activation_layer=activation_layer, + norm_layer=norm_layer, + stride=stride, + bias_attr=bias_attr, + **kwargs) + self.padding = padding + self.kernel_size = kernel_size + self.dim_pad = self.kernel_size[0] - 1 + self.stride = stride + self.causal = causal + self.conv_type = conv_type + + def _forward(self, x: paddle.Tensor) -> paddle.Tensor: + if self.dim_pad > 0 and self.conv_2 is None and self.causal is True: + x = self._cat_stream_buffer(x) + b, c, t, h, w = x.shape + if self.conv_type == "2plus1d": + x = paddle.transpose(x, (0, 2, 1, 3, 4)) # bcthw --> btchw + x = paddle.reshape_(x, (-1, c, h, w)) # btchw --> bt,c,h,w + x = self.conv_1(x) + if self.conv_type == "2plus1d": + b, c, h, w = x.shape + x = paddle.reshape_(x, (-1, t, c, h, w)) # bt,c,h,w --> b,t,c,h,w + x = paddle.transpose(x, (0, 2, 1, 3, 4)) # b,t,c,h,w --> b,c,t,h,w + if self.conv_2 is not None: + if self.dim_pad > 0 and self.causal is True: + x = self._cat_stream_buffer(x) + b, c, t, h, w = x.shape + x = paddle.reshape_(x, (b, c, t, h * w)) + x = self.conv_2(x) + b, c, t, _ = x.shape + x = paddle.reshape_(x, (b, c, t, h, w)) + return x + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + x = self._forward(x) + return x + + def _cat_stream_buffer(self, x: paddle.Tensor) -> paddle.Tensor: + if self.activation is None: + self._setup_activation(x.shape) + x = paddle.concat((self.activation, x), 2) + self._save_in_activation(x) + return x + + def _save_in_activation(self, x: paddle.Tensor) -> None: + assert self.dim_pad > 0 + self.activation = paddle.to_tensor(x.numpy()[:, :, -self.dim_pad:, + ...]).clone().detach() + + def _setup_activation(self, input_shape: Tuple[float, ...]) -> None: + assert self.dim_pad > 0 + self.activation = paddle.zeros(shape=[ + *input_shape[:2], # type: ignore + self.dim_pad, + *input_shape[3:] + ]) + + +class TemporalCGAvgPool3D(CausalModule): + def __init__(self, ) -> None: + super().__init__() + self.n_cumulated_values = 0 + self.register_forward_post_hook(self._detach_activation) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + input_shape = x.shape + cumulative_sum = paddle.cumsum(x, axis=2) + if self.activation is None: + self.activation = cumulative_sum[:, :, -1:].clone() + else: + cumulative_sum += self.activation + self.activation = cumulative_sum[:, :, -1:].clone() + + noe = paddle.arange(1, input_shape[2] + 1) + axis = paddle.to_tensor([0, 1, 3, 4]) + noe = paddle.unsqueeze(noe, axis=axis) + divisor = noe.expand(x.shape) + x = cumulative_sum / (self.n_cumulated_values + divisor) + self.n_cumulated_values += input_shape[2] + return x + + @staticmethod + def _detach_activation(module: CausalModule, inputs: paddle.Tensor, + output: paddle.Tensor) -> None: + module.activation.detach() + + def reset_activation(self) -> None: + super().reset_activation() + self.n_cumulated_values = 0 + + +class SqueezeExcitation(nn.Layer): + def __init__(self, + input_channels: int, + activation_2: nn.Layer, + activation_1: nn.Layer, + conv_type: str, + causal: bool, + squeeze_factor: int = 4, + bias_attr: bool = True) -> None: + super().__init__() + self.causal = causal + se_multiplier = 2 if causal else 1 + squeeze_channels = _make_divisible( + input_channels // squeeze_factor * se_multiplier, 8) + 
self.temporal_cumualtive_GAvg3D = TemporalCGAvgPool3D() + self.fc1 = ConvBlock3D(input_channels * se_multiplier, + squeeze_channels, + kernel_size=(1, 1, 1), + padding=0, + causal=causal, + conv_type=conv_type, + bias_attr=bias_attr) + self.activation_1 = activation_1() + self.activation_2 = activation_2() + self.fc2 = ConvBlock3D(squeeze_channels, + input_channels, + kernel_size=(1, 1, 1), + padding=0, + causal=causal, + conv_type=conv_type, + bias_attr=bias_attr) + + def _scale(self, inputs: paddle.Tensor) -> paddle.Tensor: + if self.causal: + x_space = paddle.mean(inputs, axis=[3, 4], keepdim=True) + scale = self.temporal_cumualtive_GAvg3D(x_space) + scale = paddle.concat((scale, x_space), axis=1) + else: + scale = F.adaptive_avg_pool3d(inputs, 1) + scale = self.fc1(scale) + scale = self.activation_1(scale) + scale = self.fc2(scale) + return self.activation_2(scale) + + def forward(self, inputs: paddle.Tensor) -> paddle.Tensor: + scale = self._scale(inputs) + return scale * inputs + + +class BasicBneck(nn.Layer): + def __init__( + self, + input_channels, + out_channels, + expanded_channels, + kernel_size, + stride, + padding, + padding_avg, + causal: bool, + conv_type: str, + norm_layer: Optional[Callable[..., nn.Layer]] = None, + activation_layer: Optional[Callable[..., nn.Layer]] = None, + ) -> None: + super().__init__() + + assert type(stride) is tuple + + if (not stride[0] == 1 or not (1 <= stride[1] <= 2) + or not (1 <= stride[2] <= 2)): + raise ValueError('illegal stride value') + + self.res = None + + layers = [] + if expanded_channels != out_channels: + # expand + self.expand = ConvBlock3D(in_planes=input_channels, + out_planes=expanded_channels, + kernel_size=(1, 1, 1), + padding=(0, 0, 0), + causal=causal, + conv_type=conv_type, + norm_layer=norm_layer, + activation_layer=activation_layer) + # deepwise + self.deep = ConvBlock3D(in_planes=expanded_channels, + out_planes=expanded_channels, + kernel_size=kernel_size, + padding=padding, + stride=stride, + groups=expanded_channels, + causal=causal, + conv_type=conv_type, + norm_layer=norm_layer, + activation_layer=activation_layer) + + # SE + self.se = SqueezeExcitation( + expanded_channels, + causal=causal, + activation_1=activation_layer, + activation_2=(nn.Sigmoid if conv_type == "3d" else nn.Hardsigmoid), + conv_type=conv_type) + # project + self.project = ConvBlock3D(expanded_channels, + out_channels, + kernel_size=(1, 1, 1), + padding=(0, 0, 0), + causal=causal, + conv_type=conv_type, + norm_layer=norm_layer, + activation_layer=Identity) + + if not (stride == (1, 1, 1) and input_channels == out_channels): + if stride != (1, 1, 1): + layers.append( + nn.AvgPool3D((1, 3, 3), stride=stride, padding=padding_avg)) + layers.append( + ConvBlock3D( + in_planes=input_channels, + out_planes=out_channels, + kernel_size=(1, 1, 1), + padding=(0, 0, 0), + norm_layer=norm_layer, + activation_layer=Identity, + causal=causal, + conv_type=conv_type, + )) + self.res = nn.Sequential(*layers) + self.alpha = self.create_parameter(shape=[1], dtype="float32") + + def forward(self, inputs: paddle.Tensor) -> paddle.Tensor: + if self.res is not None: + residual = self.res(inputs) + else: + residual = inputs + if self.expand is not None: + x = self.expand(inputs) + else: + x = inputs + + x = self.deep(x) + x = self.se(x) + x = self.project(x) + result = residual + self.alpha * x + return result + + +@BACKBONES.register() +class MoViNet(nn.Layer): + def __init__( + self, + model_type: str = 'A0', + hidden_dim: int = 2048, + causal: bool = True, + 
num_classes: int = 400, + conv_type: str = "3d", + ) -> None: + super().__init__() + """ + causal: causal mode + num_classes: number of classes for classifcation + conv_type: type of convolution either 3d or 2plus1d + """ + blocks_dic = OrderedDict() + cfg = MODEL_CONFIG[model_type] + + norm_layer = nn.BatchNorm3D if conv_type == "3d" else nn.BatchNorm2D + activation_layer = nn.Swish if conv_type == "3d" else nn.Hardswish + + # conv1 + self.conv1 = ConvBlock3D(in_planes=cfg['conv1'][0], + out_planes=cfg['conv1'][1], + kernel_size=cfg['conv1'][2], + stride=cfg['conv1'][3], + padding=cfg['conv1'][4], + causal=causal, + conv_type=conv_type, + norm_layer=norm_layer, + activation_layer=activation_layer) + # blocks + for i in range(2, len(cfg['block_num']) + 1): + for j in range(cfg['block_num'][i - 1]): + blocks_dic[f'b{i}_l{j}'] = BasicBneck( + cfg[f'b{i}_l{j}'][0], + cfg[f'b{i}_l{j}'][1], + cfg[f'b{i}_l{j}'][2], + cfg[f'b{i}_l{j}'][3], + cfg[f'b{i}_l{j}'][4], + cfg[f'b{i}_l{j}'][5], + cfg[f'b{i}_l{j}'][6], + causal=causal, + conv_type=conv_type, + norm_layer=norm_layer, + activation_layer=activation_layer) + self.blocks = nn.Sequential(*(blocks_dic.values())) + + # conv7 + self.conv7 = ConvBlock3D(in_planes=cfg['conv7'][0], + out_planes=cfg['conv7'][1], + kernel_size=cfg['conv7'][2], + stride=cfg['conv7'][3], + padding=cfg['conv7'][4], + causal=causal, + conv_type=conv_type, + norm_layer=norm_layer, + activation_layer=activation_layer) + # pool + self.classifier = nn.Sequential( + # dense9 + ConvBlock3D(in_planes=cfg['conv7'][1], + out_planes=hidden_dim, + kernel_size=(1, 1, 1), + causal=causal, + conv_type=conv_type, + bias_attr=True), + nn.Swish(), + nn.Dropout(p=0.2), + # dense10d + ConvBlock3D(in_planes=hidden_dim, + out_planes=num_classes, + kernel_size=(1, 1, 1), + causal=causal, + conv_type=conv_type, + bias_attr=True), + ) + if causal: + self.cgap = TemporalCGAvgPool3D() + self.apply(self._weight_init) + self.causal = causal + + def avg(self, x: paddle.Tensor) -> paddle.Tensor: + if self.causal: + avg = F.adaptive_avg_pool3d(x, (x.shape[2], 1, 1)) + avg = self.cgap(avg)[:, :, -1:] + else: + avg = F.adaptive_avg_pool3d(x, 1) + return avg + + @staticmethod + def _weight_init(m): + if isinstance(m, nn.Conv3D): + nn.initializer.KaimingNormal(m.weight) + if m.bias is not None: + nn.initializer.Constant(0.0)(m.bias) + elif isinstance(m, (nn.BatchNorm3D, nn.BatchNorm2D, nn.GroupNorm)): + nn.initializer.Constant(1.0)(m.weight) + nn.initializer.Constant(0.0)(m.bias) + elif isinstance(m, nn.Linear): + nn.initializer.Normal(m.weight, 0, 0.01) + nn.initializer.Constant(0.0)(m.bias) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + x = self.conv1(x) + x = self.blocks(x) + x = self.conv7(x) + x = self.avg(x) + x = self.classifier(x) + x = x.flatten(1) + return x + + @staticmethod + def _clean_activation_buffers(m): + if issubclass(type(m), CausalModule): + m.reset_activation() + + def clean_activation_buffers(self) -> None: + self.apply(self._clean_activation_buffers) + + +if __name__ == '__main__': + net = MoViNet(causal=False, conv_type='3d') + paddle.summary(net, input_size=(1, 3, 8, 224, 224)) diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/ms_tcn.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/ms_tcn.py new file mode 100644 index 0000000..fb49b9c --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/ms_tcn.py @@ -0,0 +1,154 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +import copy +import random +import math + +from paddle import ParamAttr +from ..registry import BACKBONES +from ..weight_init import weight_init_ + + +def _calculate_fan_in_and_fan_out(tensor): + dimensions = len(tensor.shape) + if dimensions < 2: + raise ValueError("Fan in and fan out can not be computed \ + for tensor with fewer than 2 dimensions") + + if dimensions == 2: # Linear + fan_in = tensor.shape[1] + fan_out = tensor.shape[0] + else: + num_input_fmaps = tensor.shape[1] + num_output_fmaps = tensor.shape[0] + receptive_field_size = 1 + if tensor.dim() > 2: + receptive_field_size = tensor[0][0].numel() + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def calculate_gain(nonlinearity=None, a=None): + if nonlinearity == 'tanh': + return 5.0 / 3 + elif nonlinearity == 'relu': + return math.sqrt(2.0) + elif nonlinearity == 'leaky_relu': + if a != None: + return math.sqrt(2.0 / (1 + a**2)) + else: + return math.sqrt(2.0 / (1 + 0.01**2)) + elif nonlinearity == 'selu': + return 3.0 / 4 + else: + return 1 + + +def KaimingUniform_like_torch(weight_npy, + mode='fan_in', + nonlinearity='leaky_relu'): + fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy) + if mode == 'fan_in': + fan_mode = fan_in + else: + fan_mode = fan_out + a = math.sqrt(5.0) + gain = calculate_gain(nonlinearity=nonlinearity, a=a) + std = gain / math.sqrt(fan_mode) + bound = math.sqrt(3.0) * std + return np.random.uniform(-bound, bound, weight_npy.shape) + + +def init_bias(weight_npy, bias_npy): + # attention this weight is not bias + fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy) + bound = 1.0 / math.sqrt(fan_in) + return np.random.uniform(-bound, bound, bias_npy.shape) + + +class SingleStageModel(nn.Layer): + + def __init__(self, num_layers, num_f_maps, dim, num_classes): + super(SingleStageModel, self).__init__() + self.conv_in = nn.Conv1D(dim, num_f_maps, 1) + self.layers = nn.LayerList([ + copy.deepcopy(DilatedResidualLayer(2**i, num_f_maps, num_f_maps)) + for i in range(num_layers) + ]) + self.conv_out = nn.Conv1D(num_f_maps, num_classes, 1) + + def forward(self, x): + out = self.conv_in(x) + for layer in self.layers: + out = layer(out) + out = self.conv_out(out) + return out + + +class DilatedResidualLayer(nn.Layer): + + def __init__(self, dilation, in_channels, out_channels): + super(DilatedResidualLayer, self).__init__() + self.conv_dilated = nn.Conv1D(in_channels, + out_channels, + 3, + padding=dilation, + dilation=dilation) + self.conv_in = nn.Conv1D(out_channels, out_channels, 1) + self.dropout = nn.Dropout() + + def forward(self, x): + out = F.relu(self.conv_dilated(x)) + out = self.conv_in(out) + out = self.dropout(out) + return (x + out) + + +@BACKBONES.register() +class MSTCN(nn.Layer): + + def __init__(self, num_stages, num_layers, num_f_maps, dim, num_classes): + 
super().__init__() + self.stage1 = SingleStageModel(num_layers, num_f_maps, dim, num_classes) + self.stages = nn.LayerList([ + copy.deepcopy( + SingleStageModel(num_layers, num_f_maps, num_classes, + num_classes)) for s in range(num_stages - 1) + ]) + + def forward(self, x): + """ MSTCN forward + """ + out = self.stage1(x) + outputs = out.unsqueeze(0) + for s in self.stages: + out = s(F.softmax(out, axis=1)) + outputs = paddle.concat((outputs, out.unsqueeze(0)), axis=0) + return outputs + + def init_weights(self): + for layer in self.sublayers(): + if isinstance(layer, nn.Conv1D): + layer.weight.set_value( + KaimingUniform_like_torch(layer.weight).astype('float32')) + if layer.bias is not None: + layer.bias.set_value( + init_bias(layer.weight, layer.bias).astype('float32')) diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/pptsm_mv2.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/pptsm_mv2.py new file mode 100644 index 0000000..28d045d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/pptsm_mv2.py @@ -0,0 +1,282 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + +# Download URL of pretrained model +# { +# "MobileNetV2": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_ssld_pretrained.pdparams", + +# "MobileNetV2_x0_25": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_25_pretrained.pdparams", +# "MobileNetV2_x0_5": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_5_pretrained.pdparams", +# "MobileNetV2_x0_75": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_75_pretrained.pdparams", +# "MobileNetV2_x1_5": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x1_5_pretrained.pdparams", +# "MobileNetV2_x2_0": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x2_0_pretrained.pdparams" +# } + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + padding, + channels=None, + num_groups=1, + name=None, + use_cudnn=True): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D(in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + + self._batch_norm = BatchNorm( + num_filters, + param_attr=ParamAttr(name=name + "_bn_scale"), + bias_attr=ParamAttr(name=name + "_bn_offset"), + moving_mean_name=name + "_bn_mean", + moving_variance_name=name + 
"_bn_variance") + + def forward(self, inputs, if_act=True): + y = self._conv(inputs) + y = self._batch_norm(y) + if if_act: + y = F.relu6(y) + return y + + +class InvertedResidualUnit(nn.Layer): + def __init__(self, num_channels, num_in_filter, num_filters, stride, + filter_size, padding, expansion_factor, name, num_seg): + super(InvertedResidualUnit, self).__init__() + self.num_seg = num_seg + num_expfilter = int(round(num_in_filter * expansion_factor)) + self._expand_conv = ConvBNLayer(num_channels=num_channels, + num_filters=num_expfilter, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + name=name + "_expand") + + self._bottleneck_conv = ConvBNLayer(num_channels=num_expfilter, + num_filters=num_expfilter, + filter_size=filter_size, + stride=stride, + padding=padding, + num_groups=num_expfilter, + use_cudnn=False, + name=name + "_dwise") + + self._linear_conv = ConvBNLayer(num_channels=num_expfilter, + num_filters=num_filters, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + name=name + "_linear") + + def forward(self, inputs, ifshortcut): + # add temporal shift module + y = inputs + if ifshortcut: + y = F.temporal_shift(y, self.num_seg, 1.0 / self.num_seg) + + y = self._expand_conv(y, if_act=True) + y = self._bottleneck_conv(y, if_act=True) + y = self._linear_conv(y, if_act=False) + if ifshortcut: + y = paddle.add(inputs, y) + return y + + +class InvresiBlocks(nn.Layer): + def __init__(self, in_c, t, c, n, s, name, num_seg): + super(InvresiBlocks, self).__init__() + + self._first_block = InvertedResidualUnit(num_channels=in_c, + num_in_filter=in_c, + num_filters=c, + stride=s, + filter_size=3, + padding=1, + expansion_factor=t, + name=name + "_1", + num_seg=num_seg) + + self._block_list = [] + for i in range(1, n): + block = self.add_sublayer(name + "_" + str(i + 1), + sublayer=InvertedResidualUnit( + num_channels=c, + num_in_filter=c, + num_filters=c, + stride=1, + filter_size=3, + padding=1, + expansion_factor=t, + name=name + "_" + str(i + 1), + num_seg=num_seg)) + self._block_list.append(block) + + def forward(self, inputs): + y = self._first_block(inputs, ifshortcut=False) + for block in self._block_list: + y = block(y, ifshortcut=True) + return y + + +class MobileNet(nn.Layer): + def __init__(self, + class_num=400, + scale=1.0, + pretrained=None, + prefix_name="", + num_seg=8): + super(MobileNet, self).__init__() + self.scale = scale + self.class_num = class_num + self.pretrained = pretrained + self.num_seg = num_seg + + bottleneck_params_list = [ + (1, 16, 1, 1), + (6, 24, 2, 2), + (6, 32, 3, 2), + (6, 64, 4, 2), + (6, 96, 3, 1), + (6, 160, 3, 2), + (6, 320, 1, 1), + ] + + self.conv1 = ConvBNLayer(num_channels=3, + num_filters=int(32 * scale), + filter_size=3, + stride=2, + padding=1, + name=prefix_name + "conv1_1") + + self.block_list = [] + i = 1 + in_c = int(32 * scale) + for layer_setting in bottleneck_params_list: + t, c, n, s = layer_setting + i += 1 + block = self.add_sublayer(prefix_name + "conv" + str(i), + sublayer=InvresiBlocks(in_c=in_c, + t=t, + c=int(c * scale), + n=n, + s=s, + name=prefix_name + + "conv" + str(i), + num_seg=num_seg)) + self.block_list.append(block) + in_c = int(c * scale) + + self.out_c = int(1280 * scale) if scale > 1.0 else 1280 + self.conv9 = ConvBNLayer(num_channels=in_c, + num_filters=self.out_c, + filter_size=1, + stride=1, + padding=0, + name=prefix_name + "conv9") + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.out = Linear(self.out_c, + class_num, + weight_attr=ParamAttr(name=prefix_name + + "fc10_weights"), + 
bias_attr=ParamAttr(name=prefix_name + "fc10_offset")) + + def init_weights(self): + """Initiate the parameters. + """ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + y = self.conv1(inputs, if_act=True) + for block in self.block_list: + y = block(y) + y = self.conv9(y, if_act=True) + y = self.pool2d_avg(y) + + y = paddle.reshape(y, [-1, self.num_seg, y.shape[1]]) + y = paddle.mean(y, axis=1) + y = paddle.reshape(y, shape=[-1, self.out_c]) + + y = self.out(y) + return y + + +@BACKBONES.register() +def PPTSM_MobileNetV2(pretrained=None, **kwargs): + model = MobileNet(pretrained=pretrained, scale=1.0, **kwargs) + return model + + +def PPTSM_MobileNetV2_x0_25(pretrained=None, **kwargs): + model = MobileNet(pretrained=pretrained, scale=0.25, **kwargs) + return model + + +def PPTSM_MobileNetV2_x0_5(pretrained=None, **kwargs): + model = MobileNet(pretrained=pretrained, scale=0.5, **kwargs) + return model + + +def PPTSM_MobileNetV2_x0_75(pretrained=None, **kwargs): + model = MobileNet(pretrained=pretrained, scale=0.75, **kwargs) + return model + + +def PPTSM_MobileNetV2_x1_5(pretrained=None, **kwargs): + model = MobileNet(pretrained=pretrained, scale=1.5, **kwargs) + return model + + +def PPTSM_MobileNetV2_x2_0(pretrained=None, **kwargs): + model = MobileNet(pretrained=pretrained, scale=2.0, **kwargs) + return model diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/pptsm_mv3.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/pptsm_mv3.py new file mode 100644 index 0000000..cd10bac --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/pptsm_mv3.py @@ -0,0 +1,408 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# reference: https://arxiv.org/abs/1905.02244 + +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear +from paddle.regularizer import L2Decay + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + +# Download URL of pretrained model +# MODEL_URLS = { +# "MobileNetV3_small_x1_0": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x1_0_ssld_pretrained.pdparams", +# "MobileNetV3_large_x1_0": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x1_0_ssld_pretrained.pdparams", +# } + +MODEL_STAGES_PATTERN = { + "MobileNetV3_small": ["blocks[0]", "blocks[2]", "blocks[7]", "blocks[10]"], + "MobileNetV3_large": + ["blocks[0]", "blocks[2]", "blocks[5]", "blocks[11]", "blocks[14]"] +} + +# "large", "small" is just for MobinetV3_large, MobileNetV3_small respectively. +# The type of "large" or "small" config is a list. Each element(list) represents a depthwise block, which is composed of k, exp, se, act, s. +# k: kernel_size +# exp: middle channel number in depthwise block +# c: output channel number in depthwise block +# se: whether to use SE block +# act: which activation to use +# s: stride in depthwise block +NET_CONFIG = { + "large": [ + # k, exp, c, se, act, s + [3, 16, 16, False, "relu", 1], + [3, 64, 24, False, "relu", 2], + [3, 72, 24, False, "relu", 1], + [5, 72, 40, True, "relu", 2], + [5, 120, 40, True, "relu", 1], + [5, 120, 40, True, "relu", 1], + [3, 240, 80, False, "hardswish", 2], + [3, 200, 80, False, "hardswish", 1], + [3, 184, 80, False, "hardswish", 1], + [3, 184, 80, False, "hardswish", 1], + [3, 480, 112, True, "hardswish", 1], + [3, 672, 112, True, "hardswish", 1], + [5, 672, 160, True, "hardswish", 2], + [5, 960, 160, True, "hardswish", 1], + [5, 960, 160, True, "hardswish", 1], + ], + "small": [ + # k, exp, c, se, act, s + [3, 16, 16, True, "relu", 2], + [3, 72, 24, False, "relu", 2], + [3, 88, 24, False, "relu", 1], + [5, 96, 40, True, "hardswish", 2], + [5, 240, 40, True, "hardswish", 1], + [5, 240, 40, True, "hardswish", 1], + [5, 120, 48, True, "hardswish", 1], + [5, 144, 48, True, "hardswish", 1], + [5, 288, 96, True, "hardswish", 2], + [5, 576, 96, True, "hardswish", 1], + [5, 576, 96, True, "hardswish", 1], + ] +} +# first conv output channel number in MobileNetV3 +STEM_CONV_NUMBER = 16 +# last second conv output channel for "small" +LAST_SECOND_CONV_SMALL = 576 +# last second conv output channel for "large" +LAST_SECOND_CONV_LARGE = 960 +# last conv output channel number for "large" and "small" +LAST_CONV = 1280 + + +def _make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def _create_act(act): + if act == "hardswish": + return nn.Hardswish() + elif act == "relu": + return nn.ReLU() + elif act is None: + return None + else: + raise RuntimeError( + "The activation function is not supported: {}".format(act)) + + +class MobileNetV3(nn.Layer): + """ + MobileNetV3 + Args: + config: list. MobileNetV3 depthwise blocks config. + scale: float=1.0. The coefficient that controls the size of network parameters. + class_num: int=1000. The number of classes. + inplanes: int=16. 
The output channel number of first convolution layer. + class_squeeze: int=960. The output channel number of penultimate convolution layer. + class_expand: int=1280. The output channel number of last convolution layer. + dropout_prob: float=0.2. Probability of setting units to zero. + Returns: + model: nn.Layer. Specific MobileNetV3 model depends on args. + """ + def __init__(self, + config, + stages_pattern, + scale=1.0, + class_num=400, + inplanes=STEM_CONV_NUMBER, + class_squeeze=LAST_SECOND_CONV_LARGE, + class_expand=LAST_CONV, + dropout_prob=0.2, + num_seg=8, + pretrained=None, + return_patterns=None, + return_stages=None): + super().__init__() + + self.cfg = config + self.scale = scale + self.inplanes = inplanes + self.class_squeeze = class_squeeze + self.class_expand = class_expand + self.class_num = class_num + self.num_seg = num_seg + self.pretrained = pretrained + + self.conv = ConvBNLayer(in_c=3, + out_c=_make_divisible(self.inplanes * + self.scale), + filter_size=3, + stride=2, + padding=1, + num_groups=1, + if_act=True, + act="hardswish") + + self.blocks = nn.Sequential(*[ + ResidualUnit(in_c=_make_divisible(self.inplanes * self.scale if i == + 0 else self.cfg[i - 1][2] * + self.scale), + mid_c=_make_divisible(self.scale * exp), + out_c=_make_divisible(self.scale * c), + filter_size=k, + stride=s, + use_se=se, + num_seg=self.num_seg, + act=act) + for i, (k, exp, c, se, act, s) in enumerate(self.cfg) + ]) + + self.last_second_conv = ConvBNLayer( + in_c=_make_divisible(self.cfg[-1][2] * self.scale), + out_c=_make_divisible(self.scale * self.class_squeeze), + filter_size=1, + stride=1, + padding=0, + num_groups=1, + if_act=True, + act="hardswish") + + self.avg_pool = AdaptiveAvgPool2D(1) + + self.last_conv = Conv2D(in_channels=_make_divisible(self.scale * + self.class_squeeze), + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + + self.hardswish = nn.Hardswish() + if dropout_prob is not None: + self.dropout = Dropout(p=dropout_prob, mode="downscale_in_infer") + else: + self.dropout = None + + self.fc = Linear(self.class_expand, class_num) + + def init_weights(self): + """Initiate the parameters. 
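+
+        When `self.pretrained` is a non-empty path, the checkpoint is loaded via
+        `load_ckpt`; otherwise Conv2D layers are initialized with KaimingNormal
+        and BatchNorm2D layers with Constant(value=1).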
+ """ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + #XXX: no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, x): + x = self.conv(x) + x = self.blocks(x) + x = self.last_second_conv(x) + x = self.avg_pool(x) + x = self.last_conv(x) + x = self.hardswish(x) + if self.dropout is not None: + x = self.dropout(x) + + # feature aggregation for video + x = paddle.reshape(x, [-1, self.num_seg, x.shape[1]]) + x = paddle.mean(x, axis=1) + x = paddle.reshape(x, shape=[-1, self.class_expand]) + + x = self.fc(x) + + return x + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_c, + out_c, + filter_size, + stride, + padding, + num_groups=1, + if_act=True, + act=None): + super().__init__() + + self.conv = Conv2D(in_channels=in_c, + out_channels=out_c, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + bias_attr=False) + self.bn = BatchNorm(num_channels=out_c, + act=None, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.if_act = if_act + self.act = _create_act(act) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.if_act: + x = self.act(x) + return x + + +class ResidualUnit(nn.Layer): + def __init__(self, + in_c, + mid_c, + out_c, + filter_size, + stride, + use_se, + num_seg=8, + act=None): + super().__init__() + self.if_shortcut = stride == 1 and in_c == out_c + self.if_se = use_se + self.num_seg = num_seg + + self.expand_conv = ConvBNLayer(in_c=in_c, + out_c=mid_c, + filter_size=1, + stride=1, + padding=0, + if_act=True, + act=act) + self.bottleneck_conv = ConvBNLayer(in_c=mid_c, + out_c=mid_c, + filter_size=filter_size, + stride=stride, + padding=int((filter_size - 1) // 2), + num_groups=mid_c, + if_act=True, + act=act) + if self.if_se: + self.mid_se = SEModule(mid_c) + self.linear_conv = ConvBNLayer(in_c=mid_c, + out_c=out_c, + filter_size=1, + stride=1, + padding=0, + if_act=False, + act=None) + + def forward(self, x): + identity = x + + if self.if_shortcut: + x = F.temporal_shift(x, self.num_seg, 1.0 / self.num_seg) + + x = self.expand_conv(x) + x = self.bottleneck_conv(x) + if self.if_se: + x = self.mid_se(x) + x = self.linear_conv(x) + if self.if_shortcut: + x = paddle.add(identity, x) + return x + + +# nn.Hardsigmoid can't transfer "slope" and "offset" in nn.functional.hardsigmoid +class Hardsigmoid(nn.Layer): + def __init__(self, slope=0.2, offset=0.5): + super().__init__() + self.slope = slope + self.offset = offset + + def forward(self, x): + return nn.functional.hardsigmoid(x, + slope=self.slope, + offset=self.offset) + + +class SEModule(nn.Layer): + def __init__(self, channel, reduction=4): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D(in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0) + self.relu = nn.ReLU() + self.conv2 = Conv2D(in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0) + self.hardsigmoid = Hardsigmoid(slope=0.2, offset=0.5) + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + return paddle.multiply(x=identity, y=x) + + +def 
PPTSM_MobileNetV3_small_x1_0(pretrained=None, **kwargs): + """ + MobileNetV3_small_x1_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_small_x1_0` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["small"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"], + class_squeeze=LAST_SECOND_CONV_SMALL, + pretrained=pretrained, + **kwargs) + return model + + +@BACKBONES.register() +def PPTSM_MobileNetV3(pretrained=None, **kwargs): + """ + MobileNetV3_large_x1_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_large_x1_0` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["large"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"], + class_squeeze=LAST_SECOND_CONV_LARGE, + pretrained=pretrained, + **kwargs) + return model diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/pptsm_v2.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/pptsm_v2.py new file mode 100644 index 0000000..07dc5bf --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/pptsm_v2.py @@ -0,0 +1,405 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +from paddle import ParamAttr +from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear, BatchNorm2D +from paddle.regularizer import L2Decay +from paddle.nn.initializer import KaimingNormal +import paddle.nn.functional as F + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + +# MODEL_URLS = { +# "PPLCNetV2": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNetV2_base_ssld_pretrained.pdparams", +# } + +MODEL_STAGES_PATTERN = { + "PPLCNet": ["blocks2", "blocks3", "blocks4", "blocks5", "blocks6"] +} + +NET_CONFIG = { + # in_channels, kernel_size, split_pw, use_rep, use_se, use_shortcut + "stage1": [64, 3, False, False, False, False], + "stage2": [128, 3, False, False, False, False], + "stage3": [256, 5, True, True, True, False], + "stage4": [512, 5, False, True, False, True], +} + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class GlobalAttention(nn.Layer): + """ + Lightweight temporal attention module. 
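+
+    Each segment's feature map is average-pooled to a scalar, the resulting
+    [N, num_seg] vector is passed through a fully connected layer over the
+    temporal axis, and the segments are re-weighted by the sigmoid of that
+    output (see `forward`).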
+ """ + + def __init__(self, num_seg=8): + super().__init__() + self.fc = nn.Linear(in_features=num_seg, + out_features=num_seg, + weight_attr=ParamAttr(learning_rate=5.0, + regularizer=L2Decay(1e-4)), + bias_attr=ParamAttr(learning_rate=10.0, + regularizer=L2Decay(0.0))) + self.num_seg = num_seg + + def forward(self, x): + _, C, H, W = x.shape + x0 = x + + x = x.reshape([-1, self.num_seg, C * H * W]) + x = paddle.mean(x, axis=2) # efficient way of avg_pool + x = x.squeeze(axis=-1) + x = self.fc(x) + attention = F.sigmoid(x) + attention = attention.reshape( + (-1, self.num_seg, 1, 1, 1)) #for broadcast + + x0 = x0.reshape([-1, self.num_seg, C, H, W]) + y = paddle.multiply(x0, attention) + y = y.reshape_([-1, C, H, W]) + return y + + +class ConvBNLayer(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + use_act=True): + super().__init__() + self.use_act = use_act + self.conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + + self.bn = BatchNorm2D(out_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + if self.use_act: + self.act = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.use_act: + x = self.act(x) + return x + + +class SEModule(nn.Layer): + + def __init__(self, channel, reduction=4): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D(in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0) + self.relu = nn.ReLU() + self.conv2 = Conv2D(in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0) + self.hardsigmoid = nn.Sigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = paddle.multiply(x=identity, y=x) + return x + + +class RepDepthwiseSeparable(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + stride, + dw_size=3, + split_pw=False, + use_rep=False, + use_se=False, + use_shortcut=False): + super().__init__() + self.is_repped = False + + self.dw_size = dw_size + self.split_pw = split_pw + self.use_rep = use_rep + self.use_se = use_se + self.use_shortcut = True if use_shortcut and stride == 1 and in_channels == out_channels else False + + if self.use_rep: + self.dw_conv_list = nn.LayerList() + for kernel_size in range(self.dw_size, 0, -2): + if kernel_size == 1 and stride != 1: + continue + dw_conv = ConvBNLayer(in_channels=in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + stride=stride, + groups=in_channels, + use_act=False) + self.dw_conv_list.append(dw_conv) + self.dw_conv = nn.Conv2D(in_channels=in_channels, + out_channels=in_channels, + kernel_size=dw_size, + stride=stride, + padding=(dw_size - 1) // 2, + groups=in_channels) + else: + self.dw_conv = ConvBNLayer(in_channels=in_channels, + out_channels=in_channels, + kernel_size=dw_size, + stride=stride, + groups=in_channels) + + self.act = nn.ReLU() + + if use_se: + self.se = SEModule(in_channels) + + if self.split_pw: + pw_ratio = 0.5 + self.pw_conv_1 = ConvBNLayer(in_channels=in_channels, + kernel_size=1, + out_channels=int(out_channels * + pw_ratio), + stride=1) + self.pw_conv_2 = ConvBNLayer(in_channels=int(out_channels * + pw_ratio), + 
kernel_size=1, + out_channels=out_channels, + stride=1) + else: + self.pw_conv = ConvBNLayer(in_channels=in_channels, + kernel_size=1, + out_channels=out_channels, + stride=1) + + def forward(self, x): + if self.use_rep: + input_x = x + if self.is_repped: + x = self.act(self.dw_conv(x)) + else: + y = self.dw_conv_list[0](x) + for dw_conv in self.dw_conv_list[1:]: + y += dw_conv(x) + x = self.act(y) + else: + x = self.dw_conv(x) + + if self.use_se: + x = self.se(x) + if self.split_pw: + x = self.pw_conv_1(x) + x = self.pw_conv_2(x) + else: + x = self.pw_conv(x) + if self.use_shortcut: + x = x + input_x + return x + + def rep(self): + if self.use_rep: + self.is_repped = True + kernel, bias = self._get_equivalent_kernel_bias() + self.dw_conv.weight.set_value(kernel) + self.dw_conv.bias.set_value(bias) + + def _get_equivalent_kernel_bias(self): + kernel_sum = 0 + bias_sum = 0 + for dw_conv in self.dw_conv_list: + kernel, bias = self._fuse_bn_tensor(dw_conv) + kernel = self._pad_tensor(kernel, to_size=self.dw_size) + kernel_sum += kernel + bias_sum += bias + return kernel_sum, bias_sum + + def _fuse_bn_tensor(self, branch): + kernel = branch.conv.weight + running_mean = branch.bn._mean + running_var = branch.bn._variance + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn._epsilon + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + return kernel * t, beta - running_mean * gamma / std + + def _pad_tensor(self, tensor, to_size): + from_size = tensor.shape[-1] + if from_size == to_size: + return tensor + pad = (to_size - from_size) // 2 + return F.pad(tensor, [pad, pad, pad, pad]) + + +class PPTSM_v2_LCNet(nn.Layer): + + def __init__(self, + scale, + depths, + class_num=400, + dropout_prob=0, + num_seg=8, + use_temporal_att=False, + pretrained=None, + use_last_conv=True, + class_expand=1280): + super().__init__() + self.scale = scale + self.use_last_conv = use_last_conv + self.class_expand = class_expand + self.num_seg = num_seg + self.use_temporal_att = use_temporal_att + self.pretrained = pretrained + + self.stem = nn.Sequential(*[ + ConvBNLayer(in_channels=3, + kernel_size=3, + out_channels=make_divisible(32 * scale), + stride=2), + RepDepthwiseSeparable(in_channels=make_divisible(32 * scale), + out_channels=make_divisible(64 * scale), + stride=1, + dw_size=3) + ]) + + # stages + self.stages = nn.LayerList() + for depth_idx, k in enumerate(NET_CONFIG): + in_channels, kernel_size, split_pw, use_rep, use_se, use_shortcut = NET_CONFIG[ + k] + self.stages.append( + nn.Sequential(*[ + RepDepthwiseSeparable(in_channels=make_divisible( + (in_channels if i == 0 else in_channels * 2) * scale), + out_channels=make_divisible( + in_channels * 2 * scale), + stride=2 if i == 0 else 1, + dw_size=kernel_size, + split_pw=split_pw, + use_rep=use_rep, + use_se=use_se, + use_shortcut=use_shortcut) + for i in range(depths[depth_idx]) + ])) + + self.avg_pool = AdaptiveAvgPool2D(1) + + if self.use_last_conv: + self.last_conv = Conv2D(in_channels=make_divisible( + NET_CONFIG["stage4"][0] * 2 * scale), + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + self.act = nn.ReLU() + self.dropout = Dropout(p=dropout_prob, mode="downscale_in_infer") + + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + in_features = self.class_expand if self.use_last_conv else NET_CONFIG[ + "stage4"][0] * 2 * scale + self.fc = Linear(in_features, class_num) + if self.use_temporal_att: + self.global_attention = GlobalAttention(num_seg=self.num_seg) + 
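+        # GlobalAttention (when enabled) and the temporal shift are applied
+        # only in front of the third stage; see `forward` below.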
+ def init_weights(self): + """Initiate the parameters. + """ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, x): + x = self.stem(x) + count = 0 + for stage in self.stages: + # only add temporal attention and tsm in stage3 for efficiency + if count == 2: + # add temporal attention + if self.use_temporal_att: + x = self.global_attention(x) + x = F.temporal_shift(x, self.num_seg, 1.0 / self.num_seg) + count += 1 + x = stage(x) + + x = self.avg_pool(x) + if self.use_last_conv: + x = self.last_conv(x) + x = self.act(x) + x = self.dropout(x) + + # Feature aggregation + x = paddle.reshape(x, [-1, self.num_seg, x.shape[1]]) + x = paddle.mean(x, axis=1) + x = paddle.reshape(x, shape=[-1, self.class_expand]) + + x = self.fc(x) + return x + + +@BACKBONES.register() +def PPTSM_v2(pretrained=None, use_ssld=False, **kwargs): + """ + PP-TSM_v2 model. + Args: + pretrained: str, means the path of the pretrained model. + Returns: + model: nn.Layer. + """ + model = PPTSM_v2_LCNet(pretrained=pretrained, + scale=1.0, + depths=[2, 2, 6, 2], + dropout_prob=0.2, + **kwargs) + return model diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet.py new file mode 100644 index 0000000..2f07991 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet.py @@ -0,0 +1,283 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import math + +import paddle +import paddle.nn as nn +from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D, + AvgPool2D) +from paddle import ParamAttr +import paddle.nn.functional as F + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + + +class ConvBNLayer(nn.Layer): + """Conv2D and BatchNorm2D layer. + + Args: + in_channels (int): Number of channels for the input. + out_channels (int): Number of channels for the output. + kernel_size (int): Kernel size. + stride (int): Stride in the Conv2D layer. Default: 1. + groups (int): Groups in the Conv2D, Default: 1. + act (str): Indicate activation after BatchNorm2D layer. + name (str): the name of an instance of ConvBNLayer. + + Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method. 
+ + """ + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + + self._act = act + + self._batch_norm = BatchNorm2D(out_channels, + weight_attr=ParamAttr(name=bn_name + + "_scale"), + bias_attr=ParamAttr(bn_name + "_offset")) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + if self._act: + y = getattr(paddle.nn.functional, self._act)(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + name=None): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act="relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act="relu", + name=name + "_branch2b") + + self.conv2 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + return F.relu(y) + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + filter_size=3, + stride=stride, + act="relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + filter_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(short, conv1) + y = F.relu(y) + return y + + +@BACKBONES.register() +class ResNet(nn.Layer): + """ResNet backbone. + + Args: + depth (int): Depth of resnet model. + pretrained (str): pretrained model. Default: None. 
+ """ + def __init__(self, depth, pretrained=None): + super(ResNet, self).__init__() + self.pretrained = pretrained + self.layers = depth + + supported_layers = [18, 34, 50, 101, 152] + assert self.layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, self.layers) + + if self.layers == 18: + depth = [2, 2, 2, 2] + elif self.layers == 34 or self.layers == 50: + depth = [3, 4, 6, 3] + elif self.layers == 101: + depth = [3, 4, 23, 3] + elif self.layers == 152: + depth = [3, 8, 36, 3] + + in_channels = [64, 256, 512, 1024] + out_channels = [64, 128, 256, 512] + + self.conv = ConvBNLayer(in_channels=3, + out_channels=64, + kernel_size=7, + stride=2, + act="relu", + name="conv1") + self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if self.layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if self.layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + conv_name, + BottleneckBlock( + # NOTE: Be careful! Here is different from TSM model. + in_channels=in_channels[block] + if i == 0 else out_channels[block] * 4, + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + conv_name, + BasicBlock(in_channels=in_channels[block] + if i == 0 else out_channels[block], + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + self.block_list.append(basic_block) + shortcut = True + + def init_weights(self): + """Initiate the parameters. + Note: + 1. when indicate pretrained loading path, will load it to initiate backbone. + 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function. + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html + """ + #XXX: check bias!!! check pretrained!!! + + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + #XXX: no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + """Define how the backbone is going to run. 
+ + """ + #NOTE: Already merge axis 0(batches) and axis 1(channels) before extracting feature phase, + # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27 + #y = paddle.reshape( + # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]]) + + y = self.conv(inputs) + y = self.pool2D_max(y) + for block in self.block_list: + y = block(y) + return y diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet3d.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet3d.py new file mode 100644 index 0000000..33edefe --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet3d.py @@ -0,0 +1,641 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import warnings +import collections +from itertools import repeat + +import paddle +from paddle import nn + + +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable): + return tuple(x) + return tuple(repeat(x, n)) + + return parse + + +_triple = _ntuple(3) + + +class ConvBNLayer(nn.Layer): + """A conv block that bundles conv/norm/activation layers. + + This block simplifies the usage of convolution layers, which are commonly + used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU). + It is based upon three build methods: `build_conv_layer()`, + `build_norm_layer()` and `build_activation_layer()`. + + Besides, we add some additional features in this module. + 1. Automatically set `bias` of the conv layer. + 2. Spectral norm is supported. + 3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only + supports zero and circular padding, and we add "reflect" padding mode. + + Args: + in_channels (int): Number of channels in the input feature map. + Same as that in ``nn._ConvNd``. + out_channels (int): Number of channels produced by the convolution. + Same as that in ``nn._ConvNd``. + kernel_size (int | tuple[int]): Size of the convolving kernel. + Same as that in ``nn._ConvNd``. + stride (int | tuple[int]): Stride of the convolution. + Same as that in ``nn._ConvNd``. + padding (int | tuple[int]): Zero-padding added to both sides of + the input. Same as that in ``nn._ConvNd``. + dilation (int | tuple[int]): Spacing between kernel elements. + Same as that in ``nn._ConvNd``. + groups (int): Number of blocked connections from input channels to + output channels. Same as that in ``nn._ConvNd``. 
+ """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + padding=0, + stride=1, + dilation=1, + groups=1, + act=None, + bias=None, + ): + super(ConvBNLayer, self).__init__() + + self._conv = nn.Conv3D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias_attr=bias) + + self._batch_norm = nn.BatchNorm3D(out_channels, momentum=0.1) + self.act = act + if act is not None: + self._act_op = nn.ReLU() + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + if self.act is not None: + y = self._act_op(y) + + return y + + +class Bottleneck3d(nn.Layer): + """Bottleneck 3d block for ResNet3D. + + Args: + inplanes (int): Number of channels for the input in first conv3d layer. + planes (int): Number of channels produced by some norm/conv3d layers. + spatial_stride (int): Spatial stride in the conv3d layer. Default: 1. + temporal_stride (int): Temporal stride in the conv3d layer. Default: 1. + dilation (int): Spacing between kernel elements. Default: 1. + downsample (nn.Module | None): Downsample layer. Default: None. + inflate (bool): Whether to inflate kernel. Default: True. + inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the + kernel sizes and padding strides for conv1 and conv2 in each block. + Default: '3x1x1'. + non_local (bool): Determine whether to apply non-local module in this + block. Default: False. + non_local_cfg (dict): Config for non-local module. Default: ``dict()``. + conv_cfg (dict): Config dict for convolution layer. + Default: ``dict(type='Conv3d')``. + norm_cfg (dict): Config for norm layers. required keys are ``type``, + Default: ``dict(type='BN3d')``. + act_cfg (dict): Config dict for activation layer. + Default: ``dict(type='ReLU')``. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
+ """ + expansion = 4 + + def __init__(self, + inplanes, + planes, + spatial_stride=1, + temporal_stride=1, + dilation=1, + downsample=None, + inflate=True, + inflate_style='3x1x1', + non_local=False, + non_local_cfg=dict(), + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d'), + act_cfg=dict(type='ReLU'), + with_cp=False): + super().__init__() + assert inflate_style in ['3x1x1', '3x3x3'] + + self.inplanes = inplanes + self.planes = planes + self.spatial_stride = spatial_stride + self.temporal_stride = temporal_stride + self.dilation = dilation + self.inflate = inflate + self.inflate_style = inflate_style + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.act_cfg = act_cfg + self.with_cp = with_cp + self.non_local = non_local + self.non_local_cfg = non_local_cfg + + self.conv1_stride_s = 1 + self.conv2_stride_s = spatial_stride + self.conv1_stride_t = 1 + self.conv2_stride_t = temporal_stride + + if self.inflate: + if inflate_style == '3x1x1': + conv1_kernel_size = (3, 1, 1) + conv1_padding = (1, 0, 0) + conv2_kernel_size = (1, 3, 3) + conv2_padding = (0, dilation, dilation) + else: + conv1_kernel_size = (1, 1, 1) + conv1_padding = (0, 0, 0) + conv2_kernel_size = (3, 3, 3) + conv2_padding = (1, dilation, dilation) + else: + conv1_kernel_size = (1, 1, 1) + conv1_padding = (0, 0, 0) + conv2_kernel_size = (1, 3, 3) + conv2_padding = (0, dilation, dilation) + self.conv1 = ConvBNLayer( + in_channels=inplanes, + out_channels=planes, + kernel_size=conv1_kernel_size, + stride=(self.conv1_stride_t, self.conv1_stride_s, + self.conv1_stride_s), + padding=conv1_padding, + bias=False, + act='relu') + + self.conv2 = ConvBNLayer( + in_channels=planes, + out_channels=planes, + kernel_size=conv2_kernel_size, + stride=(self.conv2_stride_t, self.conv2_stride_s, + self.conv2_stride_s), + padding=conv2_padding, + dilation=(1, dilation, dilation), + bias=False, + act='relu') + + self.conv3 = ConvBNLayer( + in_channels=planes, + out_channels=planes * self.expansion, + kernel_size=1, + bias=False, + act=None, + ) + + self.downsample = downsample + self.relu = nn.ReLU() + + def forward(self, x): + """Defines the computation performed at every call.""" + + def _inner_forward(x): + """Forward wrapper for utilizing checkpoint.""" + identity = x + + out = self.conv1(x) + out = self.conv2(out) + out = self.conv3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out = out + identity + return out + + out = _inner_forward(x) + out = self.relu(out) + + if self.non_local: + out = self.non_local_block(out) + + return out + + +class ResNet3d(nn.Layer): + """ResNet 3d backbone. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + pretrained (str | None): Name of pretrained model. + stage_blocks (tuple | None): Set number of stages for each res layer. + Default: None. + pretrained2d (bool): Whether to load pretrained 2D model. + Default: True. + in_channels (int): Channel num of input features. Default: 3. + base_channels (int): Channel num of stem output features. Default: 64. + out_indices (Sequence[int]): Indices of output feature. Default: (3, ). + num_stages (int): Resnet stages. Default: 4. + spatial_strides (Sequence[int]): + Spatial strides of residual blocks of each stage. + Default: ``(1, 2, 2, 2)``. + temporal_strides (Sequence[int]): + Temporal strides of residual blocks of each stage. + Default: ``(1, 1, 1, 1)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. 
+ conv1_kernel (Sequence[int]): Kernel size of the first conv layer. + Default: ``(3, 7, 7)``. + conv1_stride_s (int): Spatial stride of the first conv layer. + Default: 2. + conv1_stride_t (int): Temporal stride of the first conv layer. + Default: 1. + pool1_stride_s (int): Spatial stride of the first pooling layer. + Default: 2. + pool1_stride_t (int): Temporal stride of the first pooling layer. + Default: 1. + with_pool2 (bool): Whether to use pool2. Default: True. + inflate (Sequence[int]): Inflate Dims of each block. + Default: (1, 1, 1, 1). + inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the + kernel sizes and padding strides for conv1 and conv2 in each block. + Default: '3x1x1'. + conv_cfg (dict): Config for conv layers. required keys are ``type`` + Default: ``dict(type='Conv3d')``. + norm_cfg (dict): Config for norm layers. required keys are ``type`` and + ``requires_grad``. + Default: ``dict(type='BN3d', requires_grad=True)``. + act_cfg (dict): Config dict for activation layer. + Default: ``dict(type='ReLU', inplace=True)``. + norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze + running stats (mean and var). Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + non_local (Sequence[int]): Determine whether to apply non-local module + in the corresponding block of each stages. Default: (0, 0, 0, 0). + non_local_cfg (dict): Config for non-local module. Default: ``dict()``. + zero_init_residual (bool): + Whether to use zero initialization for residual block, + Default: True. + kwargs (dict, optional): Key arguments for "make_res_layer". + """ + + arch_settings = { + 50: (Bottleneck3d, (3, 4, 6, 3)), + 101: (Bottleneck3d, (3, 4, 23, 3)), + 152: (Bottleneck3d, (3, 8, 36, 3)) + } + + def __init__(self, + depth, + stage_blocks=None, + pretrained2d=True, + in_channels=3, + num_stages=4, + base_channels=64, + out_indices=(3, ), + spatial_strides=(1, 2, 2, 2), + temporal_strides=(1, 1, 1, 1), + dilations=(1, 1, 1, 1), + conv1_kernel=(3, 7, 7), + conv1_stride_s=2, + conv1_stride_t=1, + pool1_stride_s=2, + pool1_stride_t=1, + with_pool1=True, + with_pool2=True, + inflate=(1, 1, 1, 1), + inflate_style='3x1x1', + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d', requires_grad=True), + act_cfg=dict(type='ReLU', inplace=True), + norm_eval=False, + with_cp=False, + non_local=(0, 0, 0, 0), + non_local_cfg=dict(), + zero_init_residual=True, + **kwargs): + super().__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + self.depth = depth + self.pretrained2d = pretrained2d + self.in_channels = in_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert 1 <= num_stages <= 4 + self.stage_blocks = stage_blocks + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.spatial_strides = spatial_strides + self.temporal_strides = temporal_strides + self.dilations = dilations + assert len(spatial_strides) == len(temporal_strides) == len( + dilations) == num_stages + if self.stage_blocks is not None: + assert len(self.stage_blocks) == num_stages + + self.conv1_kernel = conv1_kernel + self.conv1_stride_s = conv1_stride_s + self.conv1_stride_t = conv1_stride_t + self.pool1_stride_s = pool1_stride_s + self.pool1_stride_t = pool1_stride_t + self.with_pool1 = with_pool1 + self.with_pool2 = with_pool2 + self.stage_inflations = _ntuple(num_stages)(inflate) + 
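+        # An int `non_local` is broadcast to every stage, same as `inflate` above.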
self.non_local_stages = _ntuple(num_stages)(non_local) + self.inflate_style = inflate_style + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + self.zero_init_residual = zero_init_residual + + self.block, stage_blocks = self.arch_settings[depth] + + if self.stage_blocks is None: + self.stage_blocks = stage_blocks[:num_stages] + + self.inplanes = self.base_channels + + self.non_local_cfg = non_local_cfg + + self._make_stem_layer() + + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + spatial_stride = spatial_strides[i] + temporal_stride = temporal_strides[i] + dilation = dilations[i] + planes = self.base_channels * 2**i + res_layer = self.make_res_layer( + self.block, + self.inplanes, + planes, + num_blocks, + spatial_stride=spatial_stride, + temporal_stride=temporal_stride, + dilation=dilation, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + act_cfg=self.act_cfg, + non_local=self.non_local_stages[i], + non_local_cfg=self.non_local_cfg, + inflate=self.stage_inflations[i], + inflate_style=self.inflate_style, + with_cp=with_cp, + **kwargs) + self.inplanes = planes * self.block.expansion + layer_name = f'layer{i + 1}' + self.add_sublayer(layer_name, res_layer) + self.res_layers.append(layer_name) + + self.feat_dim = self.block.expansion * self.base_channels * 2**( + len(self.stage_blocks) - 1) + + @staticmethod + def make_res_layer(block, + inplanes, + planes, + blocks, + spatial_stride=1, + temporal_stride=1, + dilation=1, + inflate=1, + inflate_style='3x1x1', + non_local=0, + non_local_cfg=dict(), + norm_cfg=None, + act_cfg=None, + conv_cfg=None, + with_cp=False, + **kwargs): + """Build residual layer for ResNet3D. + + Args: + block (nn.Module): Residual module to be built. + inplanes (int): Number of channels for the input feature + in each block. + planes (int): Number of channels for the output feature + in each block. + blocks (int): Number of residual blocks. + spatial_stride (int | Sequence[int]): Spatial strides in + residual and conv layers. Default: 1. + temporal_stride (int | Sequence[int]): Temporal strides in + residual and conv layers. Default: 1. + dilation (int): Spacing between kernel elements. Default: 1. + inflate (int | Sequence[int]): Determine whether to inflate + for each block. Default: 1. + inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines + the kernel sizes and padding strides for conv1 and conv2 + in each block. Default: '3x1x1'. + non_local (int | Sequence[int]): Determine whether to apply + non-local module in the corresponding block of each stages. + Default: 0. + non_local_cfg (dict): Config for non-local module. + Default: ``dict()``. + conv_cfg (dict | None): Config for norm layers. Default: None. + norm_cfg (dict | None): Config for norm layers. Default: None. + act_cfg (dict | None): Config for activate layers. Default: None. + with_cp (bool | None): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + + Returns: + nn.Module: A residual layer for the given config. 
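+
+        Note:
+            An integer ``inflate`` or ``non_local`` is expanded to one entry per
+            block (see the first two statements of the implementation).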
+ """ + inflate = inflate if not isinstance(inflate, + int) else (inflate, ) * blocks + non_local = non_local if not isinstance(non_local, + int) else (non_local, ) * blocks + assert len(inflate) == blocks and len(non_local) == blocks + downsample = None + if spatial_stride != 1 or inplanes != planes * block.expansion: + downsample = ConvBNLayer( + in_channels=inplanes, + out_channels=planes * block.expansion, + kernel_size=1, + stride=(temporal_stride, spatial_stride, spatial_stride), + bias=False, + act=None) + + layers = [] + layers.append( + block( + inplanes, + planes, + spatial_stride=spatial_stride, + temporal_stride=temporal_stride, + dilation=dilation, + downsample=downsample, + inflate=(inflate[0] == 1), + inflate_style=inflate_style, + non_local=(non_local[0] == 1), + non_local_cfg=non_local_cfg, + norm_cfg=norm_cfg, + conv_cfg=conv_cfg, + act_cfg=act_cfg, + with_cp=with_cp, + **kwargs)) + inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block( + inplanes, + planes, + spatial_stride=1, + temporal_stride=1, + dilation=dilation, + inflate=(inflate[i] == 1), + inflate_style=inflate_style, + non_local=(non_local[i] == 1), + non_local_cfg=non_local_cfg, + norm_cfg=norm_cfg, + conv_cfg=conv_cfg, + act_cfg=act_cfg, + with_cp=with_cp, + **kwargs)) + + return nn.Sequential(*layers) + + @staticmethod + def _inflate_conv_params(conv3d, state_dict_2d, module_name_2d, + inflated_param_names): + """Inflate a conv module from 2d to 3d. + + Args: + conv3d (nn.Module): The destination conv3d module. + state_dict_2d (OrderedDict): The state dict of pretrained 2d model. + module_name_2d (str): The name of corresponding conv module in the + 2d model. + inflated_param_names (list[str]): List of parameters that have been + inflated. + """ + weight_2d_name = module_name_2d + '.weight' + + conv2d_weight = state_dict_2d[weight_2d_name] + kernel_t = conv3d.weight.data.shape[2] + + new_weight = conv2d_weight.data.unsqueeze(2).expand_as( + conv3d.weight) / kernel_t + conv3d.weight.data.copy_(new_weight) + inflated_param_names.append(weight_2d_name) + + if getattr(conv3d, 'bias') is not None: + bias_2d_name = module_name_2d + '.bias' + conv3d.bias.data.copy_(state_dict_2d[bias_2d_name]) + inflated_param_names.append(bias_2d_name) + + @staticmethod + def _inflate_bn_params(bn3d, state_dict_2d, module_name_2d, + inflated_param_names): + """Inflate a norm module from 2d to 3d. + + Args: + bn3d (nn.Module): The destination bn3d module. + state_dict_2d (OrderedDict): The state dict of pretrained 2d model. + module_name_2d (str): The name of corresponding bn module in the + 2d model. + inflated_param_names (list[str]): List of parameters that have been + inflated. + """ + for param_name, param in bn3d.named_parameters(): + param_2d_name = f'{module_name_2d}.{param_name}' + param_2d = state_dict_2d[param_2d_name] + if param.data.shape != param_2d.shape: + warnings.warn(f'The parameter of {module_name_2d} is not' + 'loaded due to incompatible shapes. 
') + return + + param.data.copy_(param_2d) + inflated_param_names.append(param_2d_name) + + for param_name, param in bn3d.named_buffers(): + param_2d_name = f'{module_name_2d}.{param_name}' + # some buffers like num_batches_tracked may not exist in old + # checkpoints + if param_2d_name in state_dict_2d: + param_2d = state_dict_2d[param_2d_name] + param.data.copy_(param_2d) + inflated_param_names.append(param_2d_name) + + def _make_stem_layer(self): + """Construct the stem layers consists of a conv+norm+act module and a + pooling layer.""" + + self.conv1 = ConvBNLayer( + in_channels=self.in_channels, + out_channels=self.base_channels, + kernel_size=self.conv1_kernel, + stride=(self.conv1_stride_t, self.conv1_stride_s, + self.conv1_stride_s), + padding=tuple([(k - 1) // 2 for k in _triple(self.conv1_kernel)]), + bias=False, + act="relu") + + self.maxpool = nn.MaxPool3D( + kernel_size=(1, 3, 3), + stride=(self.pool1_stride_t, self.pool1_stride_s, + self.pool1_stride_s), + padding=(0, 1, 1)) + + self.pool2 = nn.MaxPool3D(kernel_size=(2, 1, 1), stride=(2, 1, 1)) + + @staticmethod + def _init_weights(self, pretrained=None): + pass + + def init_weights(self, pretrained=None): + self._init_weights(self, pretrained) + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The feature of the input + samples extracted by the backbone. + """ + x = self.conv1(x) + if self.with_pool1: + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i == 0 and self.with_pool2: + x = self.pool2(x) + if i in self.out_indices: + outs.append(x) + if len(outs) == 1: + return outs[0] + + return tuple(outs) + + def train(self, mode=True): + """Set the optimization status when training.""" + super().train() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, paddle.nn._BatchNormBase): + m.eval() diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet3d_slowonly.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet3d_slowonly.py new file mode 100644 index 0000000..eb5b080 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet3d_slowonly.py @@ -0,0 +1,214 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import warnings + +import paddle +import paddle.nn as nn + +from .resnet3d import ResNet3d, ConvBNLayer +from ..registry import BACKBONES + + +@BACKBONES.register() +class ResNet3dSlowOnly(ResNet3d): + """A pathway of Slowfast based on ResNet3d. + + Args: + *args (arguments): Arguments same as :class:``ResNet3d``. + channel_ratio (int): Reduce the channel number of fast pathway + by ``channel_ratio``, corresponding to ``beta`` in the paper. + Default: 8. + **kwargs (keyword arguments): Keywords arguments for ResNet3d. 
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.inplanes = self.base_channels + + self.lateral_connections = [] + for i in range(len(self.stage_blocks)): + planes = self.base_channels * 2**i + self.inplanes = planes * self.block.expansion + + def make_res_layer(self, + block, + inplanes, + planes, + blocks, + spatial_stride=1, + temporal_stride=1, + dilation=1, + inflate=1, + inflate_style='3x1x1', + non_local=0, + non_local_cfg=dict(), + conv_cfg=None, + norm_cfg=None, + act_cfg=None, + with_cp=False): + """Build residual layer for Slowfast. + + Args: + block (nn.Module): Residual module to be built. + inplanes (int): Number of channels for the input + feature in each block. + planes (int): Number of channels for the output + feature in each block. + blocks (int): Number of residual blocks. + spatial_stride (int | Sequence[int]): Spatial strides + in residual and conv layers. Default: 1. + temporal_stride (int | Sequence[int]): Temporal strides in + residual and conv layers. Default: 1. + dilation (int): Spacing between kernel elements. Default: 1. + inflate (int | Sequence[int]): Determine whether to inflate + for each block. Default: 1. + inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines + the kernel sizes and padding strides for conv1 and + conv2 in each block. Default: ``3x1x1``. + non_local (int | Sequence[int]): Determine whether to apply + non-local module in the corresponding block of each stages. + Default: 0. + non_local_cfg (dict): Config for non-local module. + Default: ``dict()``. + conv_cfg (dict | None): Config for conv layers. Default: None. + norm_cfg (dict | None): Config for norm layers. Default: None. + act_cfg (dict | None): Config for activate layers. Default: None. + with_cp (bool): Use checkpoint or not. Using checkpoint will save + some memory while slowing down the training speed. + Default: False. + + Returns: + nn.Module: A residual layer for the given config. + """ + inflate = inflate if not isinstance(inflate, + int) else (inflate, ) * blocks + non_local = non_local if not isinstance(non_local, + int) else (non_local, ) * blocks + assert len(inflate) == blocks and len(non_local) == blocks + + lateral_inplanes = 0 + if (spatial_stride != 1 + or (inplanes + lateral_inplanes) != planes * block.expansion): + downsample = ConvBNLayer( + in_channels=inplanes + lateral_inplanes, + out_channels=planes * block.expansion, + kernel_size=1, + stride=(temporal_stride, spatial_stride, spatial_stride), + bias=False, + act=None) + else: + downsample = None + + layers = [] + layers.append( + block( + inplanes + lateral_inplanes, + planes, + spatial_stride, + temporal_stride, + dilation, + downsample, + inflate=(inflate[0] == 1), + inflate_style=inflate_style, + non_local=(non_local[0] == 1), + non_local_cfg=non_local_cfg, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + with_cp=with_cp)) + inplanes = planes * block.expansion + + for i in range(1, blocks): + layers.append( + block( + inplanes, + planes, + 1, + 1, + dilation, + inflate=(inflate[i] == 1), + inflate_style=inflate_style, + non_local=(non_local[i] == 1), + non_local_cfg=non_local_cfg, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + with_cp=with_cp)) + + return nn.Sequential(*layers) + + def _inflate_conv_params(self, conv3d, state_dict_2d, module_name_2d, + inflated_param_names): + """Inflate a conv module from 2d to 3d. 
+
+        The differences of conv modules between 2d and 3d in Pathway
+        mainly lie in the inplanes due to lateral connections. To fit the
+        shapes of the lateral connection counterpart, it will expand
+        parameters by concatenating conv2d parameters with extra zero paddings.
+
+        Args:
+            conv3d (nn.Module): The destination conv3d module.
+            state_dict_2d (OrderedDict): The state dict of pretrained 2d model.
+            module_name_2d (str): The name of corresponding conv module in the
+                2d model.
+            inflated_param_names (list[str]): List of parameters that have been
+                inflated.
+        """
+        weight_2d_name = module_name_2d + '.weight'
+        conv2d_weight = state_dict_2d[weight_2d_name]
+        old_shape = conv2d_weight.shape
+        new_shape = conv3d.weight.data.shape
+        kernel_t = new_shape[2]
+
+        if new_shape[1] != old_shape[1]:
+            if new_shape[1] < old_shape[1]:
+                warnings.warn(f'The parameter of {module_name_2d} is not '
+                              'loaded due to incompatible shapes.')
+                return
+            # Inplanes may be different due to lateral connections
+            new_channels = new_shape[1] - old_shape[1]
+            pad_shape = old_shape
+            pad_shape = pad_shape[:1] + (new_channels, ) + pad_shape[2:]
+            # Expand parameters by concatenating extra zero channels
+            conv2d_weight = paddle.concat(
+                (conv2d_weight, paddle.zeros(pad_shape)), axis=1)
+
+        new_weight = conv2d_weight.data.unsqueeze(2).expand_as(
+            conv3d.weight) / kernel_t
+        conv3d.weight.data.copy_(new_weight)
+        inflated_param_names.append(weight_2d_name)
+
+        if getattr(conv3d, 'bias') is not None:
+            bias_2d_name = module_name_2d + '.bias'
+            conv3d.bias.data.copy_(state_dict_2d[bias_2d_name])
+            inflated_param_names.append(bias_2d_name)
+
+
+if __name__ == '__main__':
+    net = ResNet3dSlowOnly(
+        depth=50,
+        in_channels=17,
+        base_channels=32,
+        conv1_kernel=(1, 7, 7),
+        num_stages=3,
+        out_indices=[2],
+        stage_blocks=[3, 4, 6],
+        conv1_stride_s=1,
+        pool1_stride_s=1,
+        inflate=[0, 1, 1],
+        with_pool2=False,
+        spatial_strides=[2, 2, 2],
+        temporal_strides=[1, 1, 2],
+        dilations=[1, 1, 1])
+    pass
diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_slowfast.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_slowfast.py
new file mode 100644
index 0000000..a679159
--- /dev/null
+++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_slowfast.py
@@ -0,0 +1,795 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
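+
+# Editor's note (illustrative usage sketch, not part of the original file;
+# the batch size, frame counts and 224x224 crop below are assumptions,
+# while alpha=8 / beta=8 follow the inline hints further down):
+#
+#     backbone = ResNetSlowFast(alpha=8, beta=8)
+#     slow = paddle.rand([2, 3, 4, 224, 224])     # [N, C, T, H, W]
+#     fast = paddle.rand([2, 3, 32, 224, 224])    # alpha * T frames
+#     slow_feat, fast_feat = backbone([slow, fast])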
+ +import paddle +import paddle.nn.functional as F +from paddle.nn.initializer import KaimingNormal +from ..registry import BACKBONES +from paddlevideo.utils.multigrid import get_norm +import sys +import numpy as np +import paddle.distributed as dist + +# seed random seed +paddle.framework.seed(0) + + +# get init parameters for conv layer +def get_conv_init(fan_out): + return KaimingNormal(fan_in=fan_out) + + +def get_bn_param_attr(bn_weight=1.0, coeff=0.0): + param_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(bn_weight), + regularizer=paddle.regularizer.L2Decay(coeff)) + return param_attr + + +"""Video models.""" + + +class BottleneckTransform(paddle.nn.Layer): + """ + Bottleneck transformation: Tx1x1, 1x3x3, 1x1x1, where T is the size of + temporal kernel. + """ + def __init__(self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1=False, + inplace_relu=True, + eps=1e-5, + dilation=1, + norm_module=paddle.nn.BatchNorm3D): + """ + Args: + dim_in (int): the channel dimensions of the input. + dim_out (int): the channel dimension of the output. + temp_kernel_size (int): the temporal kernel sizes of the middle + convolution in the bottleneck. + stride (int): the stride of the bottleneck. + dim_inner (int): the inner dimension of the block. + num_groups (int): number of groups for the convolution. num_groups=1 + is for standard ResNet like networks, and num_groups>1 is for + ResNeXt like networks. + stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise + apply stride to the 3x3 conv. + inplace_relu (bool): if True, calculate the relu on the original + input without allocating new memory. + eps (float): epsilon for batch norm. + dilation (int): size of dilation. + """ + super(BottleneckTransform, self).__init__() + self.temp_kernel_size = temp_kernel_size + self._inplace_relu = inplace_relu + self._eps = eps + self._stride_1x1 = stride_1x1 + self.norm_module = norm_module + self._construct(dim_in, dim_out, stride, dim_inner, num_groups, + dilation) + + def _construct(self, dim_in, dim_out, stride, dim_inner, num_groups, + dilation): + str1x1, str3x3 = (stride, 1) if self._stride_1x1 else (1, stride) + + fan = (dim_inner) * (self.temp_kernel_size * 1 * 1) + initializer_tmp = get_conv_init(fan) + + self.a = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_inner, + kernel_size=[self.temp_kernel_size, 1, 1], + stride=[1, str1x1, str1x1], + padding=[int(self.temp_kernel_size // 2), 0, 0], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self.a_bn = self.norm_module(num_features=dim_inner, + epsilon=self._eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + # 1x3x3, BN, ReLU. + fan = (dim_inner) * (1 * 3 * 3) + initializer_tmp = get_conv_init(fan) + + self.b = paddle.nn.Conv3D( + in_channels=dim_inner, + out_channels=dim_inner, + kernel_size=[1, 3, 3], + stride=[1, str3x3, str3x3], + padding=[0, dilation, dilation], + groups=num_groups, + dilation=[1, dilation, dilation], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self.b_bn = self.norm_module(num_features=dim_inner, + epsilon=self._eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + # 1x1x1, BN. 
+ fan = (dim_out) * (1 * 1 * 1) + initializer_tmp = get_conv_init(fan) + + self.c = paddle.nn.Conv3D( + in_channels=dim_inner, + out_channels=dim_out, + kernel_size=[1, 1, 1], + stride=[1, 1, 1], + padding=[0, 0, 0], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self.c_bn = self.norm_module( + num_features=dim_out, + epsilon=self._eps, + weight_attr=get_bn_param_attr(bn_weight=0.0), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + def forward(self, x): + # Branch2a. + x = self.a(x) + x = self.a_bn(x) + x = F.relu(x) + + # Branch2b. + x = self.b(x) + x = self.b_bn(x) + x = F.relu(x) + + # Branch2c + x = self.c(x) + x = self.c_bn(x) + return x + + +class ResBlock(paddle.nn.Layer): + """ + Residual block. + """ + def __init__(self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups=1, + stride_1x1=False, + inplace_relu=True, + eps=1e-5, + dilation=1, + norm_module=paddle.nn.BatchNorm3D): + """ + ResBlock class constructs redisual blocks. More details can be found in: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. + "Deep residual learning for image recognition." + https://arxiv.org/abs/1512.03385 + Args: + dim_in (int): the channel dimensions of the input. + dim_out (int): the channel dimension of the output. + temp_kernel_size (int): the temporal kernel sizes of the middle + convolution in the bottleneck. + stride (int): the stride of the bottleneck. + trans_func (string): transform function to be used to construct the + bottleneck. + dim_inner (int): the inner dimension of the block. + num_groups (int): number of groups for the convolution. num_groups=1 + is for standard ResNet like networks, and num_groups>1 is for + ResNeXt like networks. + stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise + apply stride to the 3x3 conv. + inplace_relu (bool): calculate the relu on the original input + without allocating new memory. + eps (float): epsilon for batch norm. + dilation (int): size of dilation. + """ + super(ResBlock, self).__init__() + self._inplace_relu = inplace_relu + self._eps = eps + self.norm_module = norm_module + self._construct( + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ) + + def _construct( + self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ): + # Use skip connection with projection if dim or res change. + if (dim_in != dim_out) or (stride != 1): + fan = (dim_out) * (1 * 1 * 1) + initializer_tmp = get_conv_init(fan) + self.branch1 = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_out, + kernel_size=1, + stride=[1, stride, stride], + padding=0, + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False, + dilation=1) + self.branch1_bn = self.norm_module( + num_features=dim_out, + epsilon=self._eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + self.branch2 = BottleneckTransform(dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1=stride_1x1, + inplace_relu=inplace_relu, + dilation=dilation, + norm_module=self.norm_module) + + def forward(self, x): + if hasattr(self, "branch1"): + x1 = self.branch1(x) + x1 = self.branch1_bn(x1) + x2 = self.branch2(x) + x = paddle.add(x=x1, y=x2) + else: + x2 = self.branch2(x) + x = paddle.add(x=x, y=x2) + + x = F.relu(x) + return x + + +class ResStage(paddle.nn.Layer): + """ + Stage of 3D ResNet. 
It expects to have one or more tensors as input for + multi-pathway (SlowFast) cases. More details can be found here: + + Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He. + "Slowfast networks for video recognition." + https://arxiv.org/pdf/1812.03982.pdf + """ + def __init__(self, + dim_in, + dim_out, + stride, + temp_kernel_sizes, + num_blocks, + dim_inner, + num_groups, + num_block_temp_kernel, + dilation, + stride_1x1=False, + inplace_relu=True, + norm_module=paddle.nn.BatchNorm3D): + """ + The `__init__` method of any subclass should also contain these arguments. + ResStage builds p streams, where p can be greater or equal to one. + Args: + dim_in (list): list of p the channel dimensions of the input. + Different channel dimensions control the input dimension of + different pathways. + dim_out (list): list of p the channel dimensions of the output. + Different channel dimensions control the input dimension of + different pathways. + temp_kernel_sizes (list): list of the p temporal kernel sizes of the + convolution in the bottleneck. Different temp_kernel_sizes + control different pathway. + stride (list): list of the p strides of the bottleneck. Different + stride control different pathway. + num_blocks (list): list of p numbers of blocks for each of the + pathway. + dim_inner (list): list of the p inner channel dimensions of the + input. Different channel dimensions control the input dimension + of different pathways. + num_groups (list): list of number of p groups for the convolution. + num_groups=1 is for standard ResNet like networks, and + num_groups>1 is for ResNeXt like networks. + num_block_temp_kernel (list): extent the temp_kernel_sizes to + num_block_temp_kernel blocks, then fill temporal kernel size + of 1 for the rest of the layers. + dilation (list): size of dilation for each pathway. 
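+
+        Example (editor's sketch mirroring the ``s2`` stage assembled in
+        ``ResNetSlowFast._construct_network`` below; the numbers assume
+        width_per_group=64, beta=8 and fusion_conv_channel_ratio=2):
+
+            ResStage(dim_in=[80, 8], dim_out=[256, 32], dim_inner=[64, 8],
+                     temp_kernel_sizes=[[1], [3]], stride=[1, 1],
+                     num_blocks=[3, 3], num_groups=[1, 1],
+                     num_block_temp_kernel=[3, 3], dilation=[1, 1])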
+ """ + super(ResStage, self).__init__() + assert all((num_block_temp_kernel[i] <= num_blocks[i] + for i in range(len(temp_kernel_sizes)))) + self.num_blocks = num_blocks + self.temp_kernel_sizes = [ + (temp_kernel_sizes[i] * num_blocks[i])[:num_block_temp_kernel[i]] + + [1] * (num_blocks[i] - num_block_temp_kernel[i]) + for i in range(len(temp_kernel_sizes)) + ] + assert (len({ + len(dim_in), + len(dim_out), + len(temp_kernel_sizes), + len(stride), + len(num_blocks), + len(dim_inner), + len(num_groups), + len(num_block_temp_kernel), + }) == 1) + self.num_pathways = len(self.num_blocks) + self.norm_module = norm_module + self._construct( + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ) + + def _construct( + self, + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ): + + for pathway in range(self.num_pathways): + for i in range(self.num_blocks[pathway]): + res_block = ResBlock( + dim_in[pathway] if i == 0 else dim_out[pathway], + dim_out[pathway], + self.temp_kernel_sizes[pathway][i], + stride[pathway] if i == 0 else 1, + dim_inner[pathway], + num_groups[pathway], + stride_1x1=stride_1x1, + inplace_relu=inplace_relu, + dilation=dilation[pathway], + norm_module=self.norm_module) + self.add_sublayer("pathway{}_res{}".format(pathway, i), + res_block) + + def forward(self, inputs): + output = [] + for pathway in range(self.num_pathways): + x = inputs[pathway] + + for i in range(self.num_blocks[pathway]): + m = getattr(self, "pathway{}_res{}".format(pathway, i)) + x = m(x) + output.append(x) + + return output + + +class ResNetBasicStem(paddle.nn.Layer): + """ + ResNe(X)t 3D stem module. + Performs spatiotemporal Convolution, BN, and Relu following by a + spatiotemporal pooling. + """ + def __init__(self, + dim_in, + dim_out, + kernel, + stride, + padding, + eps=1e-5, + norm_module=paddle.nn.BatchNorm3D): + super(ResNetBasicStem, self).__init__() + self.kernel = kernel + self.stride = stride + self.padding = padding + self.eps = eps + self.norm_module = norm_module + self._construct_stem(dim_in, dim_out) + + def _construct_stem(self, dim_in, dim_out): + fan = (dim_out) * (self.kernel[0] * self.kernel[1] * self.kernel[2]) + initializer_tmp = get_conv_init(fan) + + self._conv = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_out, + kernel_size=self.kernel, + stride=self.stride, + padding=self.padding, + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self._bn = self.norm_module(num_features=dim_out, + epsilon=self.eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + def forward(self, x): + x = self._conv(x) + x = self._bn(x) + x = F.relu(x) + + x = F.max_pool3d(x=x, + kernel_size=[1, 3, 3], + stride=[1, 2, 2], + padding=[0, 1, 1], + data_format="NCDHW") + return x + + +class VideoModelStem(paddle.nn.Layer): + """ + Video 3D stem module. Provides stem operations of Conv, BN, ReLU, MaxPool + on input data tensor for slow and fast pathways. + """ + def __init__(self, + dim_in, + dim_out, + kernel, + stride, + padding, + eps=1e-5, + norm_module=paddle.nn.BatchNorm3D): + """ + Args: + dim_in (list): the list of channel dimensions of the inputs. + dim_out (list): the output dimension of the convolution in the stem + layer. + kernel (list): the kernels' size of the convolutions in the stem + layers. Temporal kernel size, height kernel size, width kernel + size in order. 
+ stride (list): the stride sizes of the convolutions in the stem + layer. Temporal kernel stride, height kernel size, width kernel + size in order. + padding (list): the paddings' sizes of the convolutions in the stem + layer. Temporal padding size, height padding size, width padding + size in order. + eps (float): epsilon for batch norm. + """ + super(VideoModelStem, self).__init__() + + assert (len({ + len(dim_in), + len(dim_out), + len(kernel), + len(stride), + len(padding), + }) == 1), "Input pathway dimensions are not consistent." + self.num_pathways = len(dim_in) + self.kernel = kernel + self.stride = stride + self.padding = padding + self.eps = eps + self.norm_module = norm_module + self._construct_stem(dim_in, dim_out) + + def _construct_stem(self, dim_in, dim_out): + for pathway in range(len(dim_in)): + stem = ResNetBasicStem(dim_in[pathway], dim_out[pathway], + self.kernel[pathway], self.stride[pathway], + self.padding[pathway], self.eps, + self.norm_module) + self.add_sublayer("pathway{}_stem".format(pathway), stem) + + def forward(self, x): + assert (len(x) == self.num_pathways + ), "Input tensor does not contain {} pathway".format( + self.num_pathways) + + for pathway in range(len(x)): + m = getattr(self, "pathway{}_stem".format(pathway)) + x[pathway] = m(x[pathway]) + + return x + + +class FuseFastToSlow(paddle.nn.Layer): + """ + Fuses the information from the Fast pathway to the Slow pathway. Given the + tensors from Slow pathway and Fast pathway, fuse information from Fast to + Slow, then return the fused tensors from Slow and Fast pathway in order. + """ + def __init__(self, + dim_in, + fusion_conv_channel_ratio, + fusion_kernel, + alpha, + fuse_bn_relu=1, + eps=1e-5, + norm_module=paddle.nn.BatchNorm3D): + """ + Args: + dim_in (int): the channel dimension of the input. + fusion_conv_channel_ratio (int): channel ratio for the convolution + used to fuse from Fast pathway to Slow pathway. + fusion_kernel (int): kernel size of the convolution used to fuse + from Fast pathway to Slow pathway. + alpha (int): the frame rate ratio between the Fast and Slow pathway. + eps (float): epsilon for batch norm. + """ + super(FuseFastToSlow, self).__init__() + self.fuse_bn_relu = fuse_bn_relu + fan = (dim_in * fusion_conv_channel_ratio) * (fusion_kernel * 1 * 1) + initializer_tmp = get_conv_init(fan) + + self._conv_f2s = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_in * fusion_conv_channel_ratio, + kernel_size=[fusion_kernel, 1, 1], + stride=[alpha, 1, 1], + padding=[fusion_kernel // 2, 0, 0], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self._bn = norm_module(num_features=dim_in * fusion_conv_channel_ratio, + epsilon=eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + def forward(self, x): + x_s = x[0] + x_f = x[1] + fuse = self._conv_f2s(x_f) + # TODO: For AVA, set fuse_bn_relu=1, check mAP's improve. + if self.fuse_bn_relu: + fuse = self._bn(fuse) + fuse = F.relu(fuse) + x_s_fuse = paddle.concat(x=[x_s, fuse], axis=1, name=None) + + return [x_s_fuse, x_f] + + +@BACKBONES.register() +class ResNetSlowFast(paddle.nn.Layer): + """ + SlowFast model builder for SlowFast network. + + Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He. + "Slowfast networks for video recognition." 
+ https://arxiv.org/pdf/1812.03982.pdf + """ + def __init__( + self, + alpha, + beta, + bn_norm_type="batchnorm", + bn_num_splits=1, + num_pathways=2, + depth=50, + num_groups=1, + input_channel_num=[3, 3], + width_per_group=64, + fusion_conv_channel_ratio=2, + fusion_kernel_sz=7, #5? + pool_size_ratio=[[1, 1, 1], [1, 1, 1]], + fuse_bn_relu = 1, + spatial_strides = [[1, 1], [2, 2], [2, 2], [2, 2]], + use_pool_af_s2 = 1, + ): + """ + Args: + cfg (CfgNode): model building configs, details are in the + comments of the config file. + """ + super(ResNetSlowFast, self).__init__() + + self.alpha = alpha #8 + self.beta = beta #8 + self.norm_module = get_norm(bn_norm_type, bn_num_splits) + self.num_pathways = num_pathways + self.depth = depth + self.num_groups = num_groups + self.input_channel_num = input_channel_num + self.width_per_group = width_per_group + self.fusion_conv_channel_ratio = fusion_conv_channel_ratio + self.fusion_kernel_sz = fusion_kernel_sz # NOTE: modify to 7 in 8*8, 5 in old implement + self.pool_size_ratio = pool_size_ratio + self.fuse_bn_relu = fuse_bn_relu + self.spatial_strides = spatial_strides + self.use_pool_af_s2 = use_pool_af_s2 + self._construct_network() + + def _construct_network(self): + """ + Builds a SlowFast model. + The first pathway is the Slow pathway + and the second pathway is the Fast pathway. + + Args: + cfg (CfgNode): model building configs, details are in the + comments of the config file. + """ + temp_kernel = [ + [[1], [5]], # conv1 temporal kernel for slow and fast pathway. + [[1], [3]], # res2 temporal kernel for slow and fast pathway. + [[1], [3]], # res3 temporal kernel for slow and fast pathway. + [[3], [3]], # res4 temporal kernel for slow and fast pathway. + [[3], [3]], + ] # res5 temporal kernel for slow and fast pathway. 
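+
+        # Editor's note: each entry above pairs [slow, fast] temporal kernel
+        # sizes; the slow pathway only becomes temporal (kernel size 3) from
+        # res4 onwards, while the fast pathway uses temporal kernels at every
+        # stage, following the SlowFast design.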
+ + self.s1 = VideoModelStem( + dim_in=self.input_channel_num, + dim_out=[self.width_per_group, self.width_per_group // self.beta], + kernel=[temp_kernel[0][0] + [7, 7], temp_kernel[0][1] + [7, 7]], + stride=[[1, 2, 2]] * 2, + padding=[ + [temp_kernel[0][0][0] // 2, 3, 3], + [temp_kernel[0][1][0] // 2, 3, 3], + ], + norm_module=self.norm_module) + self.s1_fuse = FuseFastToSlow( + dim_in=self.width_per_group // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + fuse_bn_relu=self.fuse_bn_relu) + + # ResNet backbone + MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3)} + (d2, d3, d4, d5) = MODEL_STAGE_DEPTH[self.depth] + + num_block_temp_kernel = [[3, 3], [4, 4], [6, 6], [3, 3]] + spatial_dilations = [[1, 1], [1, 1], [1, 1], [1, 1]] + spatial_strides = self.spatial_strides + #spatial_strides = [[1, 1], [2, 2], [2, 2], [2, 2]] + #spatial_strides = [[1, 1], [2, 2], [2, 2], [1, 1]] #TODO:check which value is FAIR's impliment + + out_dim_ratio = self.beta // self.fusion_conv_channel_ratio #4 + dim_inner = self.width_per_group * self.num_groups #64 + + self.s2 = ResStage(dim_in=[ + self.width_per_group + self.width_per_group // out_dim_ratio, + self.width_per_group // self.beta, + ], + dim_out=[ + self.width_per_group * 4, + self.width_per_group * 4 // self.beta, + ], + dim_inner=[dim_inner, dim_inner // self.beta], + temp_kernel_sizes=temp_kernel[1], + stride=spatial_strides[0], + num_blocks=[d2] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[0], + dilation=spatial_dilations[0], + norm_module=self.norm_module) + + self.s2_fuse = FuseFastToSlow( + dim_in=self.width_per_group * 4 // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + fuse_bn_relu=self.fuse_bn_relu, + ) + + self.s3 = ResStage( + dim_in=[ + self.width_per_group * 4 + + self.width_per_group * 4 // out_dim_ratio, + self.width_per_group * 4 // self.beta, + ], + dim_out=[ + self.width_per_group * 8, + self.width_per_group * 8 // self.beta, + ], + dim_inner=[dim_inner * 2, dim_inner * 2 // self.beta], + temp_kernel_sizes=temp_kernel[2], + stride=spatial_strides[1], + num_blocks=[d3] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[1], + dilation=spatial_dilations[1], + norm_module=self.norm_module, + ) + + self.s3_fuse = FuseFastToSlow( + dim_in=self.width_per_group * 8 // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + fuse_bn_relu=self.fuse_bn_relu, + ) + + self.s4 = ResStage( + dim_in=[ + self.width_per_group * 8 + + self.width_per_group * 8 // out_dim_ratio, + self.width_per_group * 8 // self.beta, + ], + dim_out=[ + self.width_per_group * 16, + self.width_per_group * 16 // self.beta, + ], + dim_inner=[dim_inner * 4, dim_inner * 4 // self.beta], + temp_kernel_sizes=temp_kernel[3], + stride=spatial_strides[2], + num_blocks=[d4] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[2], + dilation=spatial_dilations[2], + norm_module=self.norm_module, + ) + + self.s4_fuse = FuseFastToSlow( + dim_in=self.width_per_group * 16 // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + 
fuse_bn_relu=self.fuse_bn_relu, + ) + + self.s5 = ResStage( + dim_in=[ + self.width_per_group * 16 + + self.width_per_group * 16 // out_dim_ratio, + self.width_per_group * 16 // self.beta, + ], + dim_out=[ + self.width_per_group * 32, + self.width_per_group * 32 // self.beta, + ], + dim_inner=[dim_inner * 8, dim_inner * 8 // self.beta], + temp_kernel_sizes=temp_kernel[4], + stride=spatial_strides[3], + num_blocks=[d5] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[3], + dilation=spatial_dilations[3], + norm_module=self.norm_module, + ) + + def init_weights(self): + pass + + def forward(self, x): + x = self.s1(x) #VideoModelStem + x = self.s1_fuse(x) #FuseFastToSlow + x = self.s2(x) #ResStage + x = self.s2_fuse(x) + + # TODO: For AVA, set use_pool_af_s2=1, check mAP's improve. + if self.use_pool_af_s2: + for pathway in range(self.num_pathways): + x[pathway] = F.max_pool3d(x=x[pathway], + kernel_size=self.pool_size_ratio[pathway], + stride=self.pool_size_ratio[pathway], + padding=[0, 0, 0], + data_format="NCDHW") + + x = self.s3(x) + x = self.s3_fuse(x) + x = self.s4(x) + x = self.s4_fuse(x) + x = self.s5(x) + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py new file mode 100644 index 0000000..d348d45 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py @@ -0,0 +1,796 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn.functional as F +from paddle.nn.initializer import KaimingNormal +from ..registry import BACKBONES +from paddlevideo.utils.multigrid import get_norm +import sys +import numpy as np +import paddle.distributed as dist + +# seed random seed +paddle.framework.seed(0) + + +# get init parameters for conv layer +def get_conv_init(fan_out): + return KaimingNormal(fan_in=fan_out) + + +def get_bn_param_attr(bn_weight=1.0, coeff=0.0): + param_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(bn_weight), + regularizer=paddle.regularizer.L2Decay(coeff)) + return param_attr + + +"""Video models.""" + + +class BottleneckTransform(paddle.nn.Layer): + """ + Bottleneck transformation: Tx1x1, 1x3x3, 1x1x1, where T is the size of + temporal kernel. + """ + def __init__(self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1=False, + inplace_relu=True, + eps=1e-5, + dilation=1, + norm_module=paddle.nn.BatchNorm3D): + """ + Args: + dim_in (int): the channel dimensions of the input. + dim_out (int): the channel dimension of the output. + temp_kernel_size (int): the temporal kernel sizes of the middle + convolution in the bottleneck. + stride (int): the stride of the bottleneck. + dim_inner (int): the inner dimension of the block. + num_groups (int): number of groups for the convolution. 
num_groups=1 + is for standard ResNet like networks, and num_groups>1 is for + ResNeXt like networks. + stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise + apply stride to the 3x3 conv. + inplace_relu (bool): if True, calculate the relu on the original + input without allocating new memory. + eps (float): epsilon for batch norm. + dilation (int): size of dilation. + """ + super(BottleneckTransform, self).__init__() + self.temp_kernel_size = temp_kernel_size + self._inplace_relu = inplace_relu + self._eps = eps + self._stride_1x1 = stride_1x1 + self.norm_module = norm_module + self._construct(dim_in, dim_out, stride, dim_inner, num_groups, + dilation) + + def _construct(self, dim_in, dim_out, stride, dim_inner, num_groups, + dilation): + str1x1, str3x3 = (stride, 1) if self._stride_1x1 else (1, stride) + + fan = (dim_inner) * (self.temp_kernel_size * 1 * 1) + initializer_tmp = get_conv_init(fan) + + self.a = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_inner, + kernel_size=[self.temp_kernel_size, 1, 1], + stride=[1, str1x1, str1x1], + padding=[int(self.temp_kernel_size // 2), 0, 0], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self.a_bn = self.norm_module(num_features=dim_inner, + epsilon=self._eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + # 1x3x3, BN, ReLU. + fan = (dim_inner) * (1 * 3 * 3) + initializer_tmp = get_conv_init(fan) + + self.b = paddle.nn.Conv3D( + in_channels=dim_inner, + out_channels=dim_inner, + kernel_size=[1, 3, 3], + stride=[1, str3x3, str3x3], + padding=[0, dilation, dilation], + groups=num_groups, + dilation=[1, dilation, dilation], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self.b_bn = self.norm_module(num_features=dim_inner, + epsilon=self._eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + # 1x1x1, BN. + fan = (dim_out) * (1 * 1 * 1) + initializer_tmp = get_conv_init(fan) + + self.c = paddle.nn.Conv3D( + in_channels=dim_inner, + out_channels=dim_out, + kernel_size=[1, 1, 1], + stride=[1, 1, 1], + padding=[0, 0, 0], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self.c_bn = self.norm_module( + num_features=dim_out, + epsilon=self._eps, + weight_attr=get_bn_param_attr(bn_weight=0.0), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + def forward(self, x): + # Branch2a. + x = self.a(x) + x = self.a_bn(x) + x = F.relu(x) + + # Branch2b. + x = self.b(x) + x = self.b_bn(x) + x = F.relu(x) + + # Branch2c + x = self.c(x) + x = self.c_bn(x) + return x + + +class ResBlock(paddle.nn.Layer): + """ + Residual block. + """ + def __init__(self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups=1, + stride_1x1=False, + inplace_relu=True, + eps=1e-5, + dilation=1, + norm_module=paddle.nn.BatchNorm3D): + """ + ResBlock class constructs redisual blocks. More details can be found in: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. + "Deep residual learning for image recognition." + https://arxiv.org/abs/1512.03385 + Args: + dim_in (int): the channel dimensions of the input. + dim_out (int): the channel dimension of the output. + temp_kernel_size (int): the temporal kernel sizes of the middle + convolution in the bottleneck. + stride (int): the stride of the bottleneck. + trans_func (string): transform function to be used to construct the + bottleneck. + dim_inner (int): the inner dimension of the block. 
+ num_groups (int): number of groups for the convolution. num_groups=1 + is for standard ResNet like networks, and num_groups>1 is for + ResNeXt like networks. + stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise + apply stride to the 3x3 conv. + inplace_relu (bool): calculate the relu on the original input + without allocating new memory. + eps (float): epsilon for batch norm. + dilation (int): size of dilation. + """ + super(ResBlock, self).__init__() + self._inplace_relu = inplace_relu + self._eps = eps + self.norm_module = norm_module + self._construct( + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ) + + def _construct( + self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ): + # Use skip connection with projection if dim or res change. + if (dim_in != dim_out) or (stride != 1): + fan = (dim_out) * (1 * 1 * 1) + initializer_tmp = get_conv_init(fan) + self.branch1 = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_out, + kernel_size=1, + stride=[1, stride, stride], + padding=0, + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False, + dilation=1) + self.branch1_bn = self.norm_module( + num_features=dim_out, + epsilon=self._eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + self.branch2 = BottleneckTransform(dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1=stride_1x1, + inplace_relu=inplace_relu, + dilation=dilation, + norm_module=self.norm_module) + + def forward(self, x): + if hasattr(self, "branch1"): + x1 = self.branch1(x) + x1 = self.branch1_bn(x1) + x2 = self.branch2(x) + x = paddle.add(x=x1, y=x2) + else: + x2 = self.branch2(x) + x = paddle.add(x=x, y=x2) + + x = F.relu(x) + return x + + +class ResStage(paddle.nn.Layer): + """ + Stage of 3D ResNet. It expects to have one or more tensors as input for + multi-pathway (SlowFast) cases. More details can be found here: + + Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He. + "Slowfast networks for video recognition." + https://arxiv.org/pdf/1812.03982.pdf + """ + def __init__(self, + dim_in, + dim_out, + stride, + temp_kernel_sizes, + num_blocks, + dim_inner, + num_groups, + num_block_temp_kernel, + dilation, + stride_1x1=False, + inplace_relu=True, + norm_module=paddle.nn.BatchNorm3D): + """ + The `__init__` method of any subclass should also contain these arguments. + ResStage builds p streams, where p can be greater or equal to one. + Args: + dim_in (list): list of p the channel dimensions of the input. + Different channel dimensions control the input dimension of + different pathways. + dim_out (list): list of p the channel dimensions of the output. + Different channel dimensions control the input dimension of + different pathways. + temp_kernel_sizes (list): list of the p temporal kernel sizes of the + convolution in the bottleneck. Different temp_kernel_sizes + control different pathway. + stride (list): list of the p strides of the bottleneck. Different + stride control different pathway. + num_blocks (list): list of p numbers of blocks for each of the + pathway. + dim_inner (list): list of the p inner channel dimensions of the + input. Different channel dimensions control the input dimension + of different pathways. + num_groups (list): list of number of p groups for the convolution. 
+ num_groups=1 is for standard ResNet like networks, and + num_groups>1 is for ResNeXt like networks. + num_block_temp_kernel (list): extent the temp_kernel_sizes to + num_block_temp_kernel blocks, then fill temporal kernel size + of 1 for the rest of the layers. + dilation (list): size of dilation for each pathway. + """ + super(ResStage, self).__init__() + assert all((num_block_temp_kernel[i] <= num_blocks[i] + for i in range(len(temp_kernel_sizes)))) + self.num_blocks = num_blocks + self.temp_kernel_sizes = [ + (temp_kernel_sizes[i] * num_blocks[i])[:num_block_temp_kernel[i]] + + [1] * (num_blocks[i] - num_block_temp_kernel[i]) + for i in range(len(temp_kernel_sizes)) + ] + assert (len({ + len(dim_in), + len(dim_out), + len(temp_kernel_sizes), + len(stride), + len(num_blocks), + len(dim_inner), + len(num_groups), + len(num_block_temp_kernel), + }) == 1) + self.num_pathways = len(self.num_blocks) + self.norm_module = norm_module + self._construct( + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ) + + def _construct( + self, + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ): + + for pathway in range(self.num_pathways): + for i in range(self.num_blocks[pathway]): + res_block = ResBlock( + dim_in[pathway] if i == 0 else dim_out[pathway], + dim_out[pathway], + self.temp_kernel_sizes[pathway][i], + stride[pathway] if i == 0 else 1, + dim_inner[pathway], + num_groups[pathway], + stride_1x1=stride_1x1, + inplace_relu=inplace_relu, + dilation=dilation[pathway], + norm_module=self.norm_module) + self.add_sublayer("pathway{}_res{}".format(pathway, i), + res_block) + + def forward(self, inputs): + output = [] + for pathway in range(self.num_pathways): + x = inputs[pathway] + + for i in range(self.num_blocks[pathway]): + m = getattr(self, "pathway{}_res{}".format(pathway, i)) + x = m(x) + output.append(x) + + return output + + +class ResNetBasicStem(paddle.nn.Layer): + """ + ResNe(X)t 3D stem module. + Performs spatiotemporal Convolution, BN, and Relu following by a + spatiotemporal pooling. + """ + def __init__(self, + dim_in, + dim_out, + kernel, + stride, + padding, + eps=1e-5, + norm_module=paddle.nn.BatchNorm3D): + super(ResNetBasicStem, self).__init__() + self.kernel = kernel + self.stride = stride + self.padding = padding + self.eps = eps + self.norm_module = norm_module + self._construct_stem(dim_in, dim_out) + + def _construct_stem(self, dim_in, dim_out): + fan = (dim_out) * (self.kernel[0] * self.kernel[1] * self.kernel[2]) + initializer_tmp = get_conv_init(fan) + + self._conv = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_out, + kernel_size=self.kernel, + stride=self.stride, + padding=self.padding, + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self._bn = self.norm_module(num_features=dim_out, + epsilon=self.eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + def forward(self, x): + x = self._conv(x) + x = self._bn(x) + x = F.relu(x) + + x = F.max_pool3d(x=x, + kernel_size=[1, 3, 3], + stride=[1, 2, 2], + padding=[0, 1, 1], + data_format="NCDHW") + return x + + +class VideoModelStem(paddle.nn.Layer): + """ + Video 3D stem module. Provides stem operations of Conv, BN, ReLU, MaxPool + on input data tensor for slow and fast pathways. 
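+
+    Example (editor's sketch based on the ``s1`` stem assembled in
+    ``ResNetSlowFast_MRI._construct_network`` below, assuming
+    width_per_group=64, beta=8 and single-channel inputs,
+    i.e. input_channel_num=[1, 1]):
+
+        VideoModelStem(dim_in=[1, 1], dim_out=[64, 8],
+                       kernel=[[1, 7, 7], [5, 7, 7]],
+                       stride=[[1, 2, 2]] * 2,
+                       padding=[[0, 3, 3], [2, 3, 3]])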
+ """ + def __init__(self, + dim_in, + dim_out, + kernel, + stride, + padding, + eps=1e-5, + norm_module=paddle.nn.BatchNorm3D): + """ + Args: + dim_in (list): the list of channel dimensions of the inputs. + dim_out (list): the output dimension of the convolution in the stem + layer. + kernel (list): the kernels' size of the convolutions in the stem + layers. Temporal kernel size, height kernel size, width kernel + size in order. + stride (list): the stride sizes of the convolutions in the stem + layer. Temporal kernel stride, height kernel size, width kernel + size in order. + padding (list): the paddings' sizes of the convolutions in the stem + layer. Temporal padding size, height padding size, width padding + size in order. + eps (float): epsilon for batch norm. + """ + super(VideoModelStem, self).__init__() + + assert (len({ + len(dim_in), + len(dim_out), + len(kernel), + len(stride), + len(padding), + }) == 1), "Input pathway dimensions are not consistent." + self.num_pathways = len(dim_in) + self.kernel = kernel + self.stride = stride + self.padding = padding + self.eps = eps + self.norm_module = norm_module + self._construct_stem(dim_in, dim_out) + + def _construct_stem(self, dim_in, dim_out): + for pathway in range(len(dim_in)): + stem = ResNetBasicStem(dim_in[pathway], dim_out[pathway], + self.kernel[pathway], self.stride[pathway], + self.padding[pathway], self.eps, + self.norm_module) + self.add_sublayer("pathway{}_stem".format(pathway), stem) + + def forward(self, x): + assert (len(x) == self.num_pathways + ), "Input tensor does not contain {} pathway".format( + self.num_pathways) + + for pathway in range(len(x)): + m = getattr(self, "pathway{}_stem".format(pathway)) + x[pathway] = m(x[pathway]) + + return x + + +class FuseFastToSlow(paddle.nn.Layer): + """ + Fuses the information from the Fast pathway to the Slow pathway. Given the + tensors from Slow pathway and Fast pathway, fuse information from Fast to + Slow, then return the fused tensors from Slow and Fast pathway in order. + """ + def __init__(self, + dim_in, + fusion_conv_channel_ratio, + fusion_kernel, + alpha, + fuse_bn_relu=1, + eps=1e-5, + norm_module=paddle.nn.BatchNorm3D): + """ + Args: + dim_in (int): the channel dimension of the input. + fusion_conv_channel_ratio (int): channel ratio for the convolution + used to fuse from Fast pathway to Slow pathway. + fusion_kernel (int): kernel size of the convolution used to fuse + from Fast pathway to Slow pathway. + alpha (int): the frame rate ratio between the Fast and Slow pathway. + eps (float): epsilon for batch norm. + """ + super(FuseFastToSlow, self).__init__() + self.fuse_bn_relu = fuse_bn_relu + fan = (dim_in * fusion_conv_channel_ratio) * (fusion_kernel * 1 * 1) + initializer_tmp = get_conv_init(fan) + + self._conv_f2s = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_in * fusion_conv_channel_ratio, + kernel_size=[fusion_kernel, 1, 1], + stride=[alpha, 1, 1], + padding=[fusion_kernel // 2, 0, 0], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self._bn = norm_module(num_features=dim_in * fusion_conv_channel_ratio, + epsilon=eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + def forward(self, x): + x_s = x[0] + x_f = x[1] + fuse = self._conv_f2s(x_f) + # TODO: For AVA, set fuse_bn_relu=1, check mAP's improve. 
+ if self.fuse_bn_relu: + fuse = self._bn(fuse) + fuse = F.relu(fuse) + x_s_fuse = paddle.concat(x=[x_s, fuse], axis=1, name=None) + + return [x_s_fuse, x_f] + + +@BACKBONES.register() +class ResNetSlowFast_MRI(paddle.nn.Layer): + """ + SlowFast model builder for SlowFast network. + + Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He. + "Slowfast networks for video recognition." + https://arxiv.org/pdf/1812.03982.pdf + """ + def __init__( + self, + alpha, + beta, + bn_norm_type="batchnorm", + bn_num_splits=1, + num_pathways=2, + depth=50, + num_groups=1, + input_channel_num=[1, 1], + width_per_group=64, + fusion_conv_channel_ratio=2, + fusion_kernel_sz=7, #5? + pool_size_ratio=[[1, 1, 1], [1, 1, 1]], + fuse_bn_relu=1, + spatial_strides=[[1, 1], [2, 2], [2, 2], [2, 2]], + use_pool_af_s2=1, + ): + """ + Args: + cfg (CfgNode): model building configs, details are in the + comments of the config file. + """ + super(ResNetSlowFast_MRI, self).__init__() + + self.alpha = alpha #8 + self.beta = beta #8 + self.norm_module = get_norm(bn_norm_type, bn_num_splits) + self.num_pathways = num_pathways + self.depth = depth + self.num_groups = num_groups + self.input_channel_num = input_channel_num + self.width_per_group = width_per_group + self.fusion_conv_channel_ratio = fusion_conv_channel_ratio + self.fusion_kernel_sz = fusion_kernel_sz # NOTE: modify to 7 in 8*8, 5 in old implement + self.pool_size_ratio = pool_size_ratio + self.fuse_bn_relu = fuse_bn_relu + self.spatial_strides = spatial_strides + self.use_pool_af_s2 = use_pool_af_s2 + self._construct_network() + + def _construct_network(self): + """ + Builds a SlowFast model. + The first pathway is the Slow pathway + and the second pathway is the Fast pathway. + + Args: + cfg (CfgNode): model building configs, details are in the + comments of the config file. + """ + temp_kernel = [ + [[1], [5]], # conv1 temporal kernel for slow and fast pathway. + [[1], [3]], # res2 temporal kernel for slow and fast pathway. + [[1], [3]], # res3 temporal kernel for slow and fast pathway. + [[3], [3]], # res4 temporal kernel for slow and fast pathway. + [[3], [3]], + ] # res5 temporal kernel for slow and fast pathway. 
+ + self.s1 = VideoModelStem( + dim_in=self.input_channel_num, + dim_out=[self.width_per_group, self.width_per_group // self.beta], + kernel=[temp_kernel[0][0] + [7, 7], temp_kernel[0][1] + [7, 7]], + stride=[[1, 2, 2]] * 2, + padding=[ + [temp_kernel[0][0][0] // 2, 3, 3], + [temp_kernel[0][1][0] // 2, 3, 3], + ], + norm_module=self.norm_module) + self.s1_fuse = FuseFastToSlow( + dim_in=self.width_per_group // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + fuse_bn_relu=self.fuse_bn_relu) + + # ResNet backbone + MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3)} + (d2, d3, d4, d5) = MODEL_STAGE_DEPTH[self.depth] + + num_block_temp_kernel = [[3, 3], [4, 4], [6, 6], [3, 3]] + spatial_dilations = [[1, 1], [1, 1], [1, 1], [1, 1]] + spatial_strides = self.spatial_strides + #spatial_strides = [[1, 1], [2, 2], [2, 2], [2, 2]] + #spatial_strides = [[1, 1], [2, 2], [2, 2], [1, 1]] #TODO:check which value is FAIR's impliment + + out_dim_ratio = self.beta // self.fusion_conv_channel_ratio #4 + dim_inner = self.width_per_group * self.num_groups #64 + + self.s2 = ResStage(dim_in=[ + self.width_per_group + self.width_per_group // out_dim_ratio, + self.width_per_group // self.beta, + ], + dim_out=[ + self.width_per_group * 4, + self.width_per_group * 4 // self.beta, + ], + dim_inner=[dim_inner, dim_inner // self.beta], + temp_kernel_sizes=temp_kernel[1], + stride=spatial_strides[0], + num_blocks=[d2] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[0], + dilation=spatial_dilations[0], + norm_module=self.norm_module) + + self.s2_fuse = FuseFastToSlow( + dim_in=self.width_per_group * 4 // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + fuse_bn_relu=self.fuse_bn_relu, + ) + + self.s3 = ResStage( + dim_in=[ + self.width_per_group * 4 + + self.width_per_group * 4 // out_dim_ratio, + self.width_per_group * 4 // self.beta, + ], + dim_out=[ + self.width_per_group * 8, + self.width_per_group * 8 // self.beta, + ], + dim_inner=[dim_inner * 2, dim_inner * 2 // self.beta], + temp_kernel_sizes=temp_kernel[2], + stride=spatial_strides[1], + num_blocks=[d3] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[1], + dilation=spatial_dilations[1], + norm_module=self.norm_module, + ) + + self.s3_fuse = FuseFastToSlow( + dim_in=self.width_per_group * 8 // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + fuse_bn_relu=self.fuse_bn_relu, + ) + + self.s4 = ResStage( + dim_in=[ + self.width_per_group * 8 + + self.width_per_group * 8 // out_dim_ratio, + self.width_per_group * 8 // self.beta, + ], + dim_out=[ + self.width_per_group * 16, + self.width_per_group * 16 // self.beta, + ], + dim_inner=[dim_inner * 4, dim_inner * 4 // self.beta], + temp_kernel_sizes=temp_kernel[3], + stride=spatial_strides[2], + num_blocks=[d4] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[2], + dilation=spatial_dilations[2], + norm_module=self.norm_module, + ) + + self.s4_fuse = FuseFastToSlow( + dim_in=self.width_per_group * 16 // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + 
fuse_bn_relu=self.fuse_bn_relu, + ) + + self.s5 = ResStage( + dim_in=[ + self.width_per_group * 16 + + self.width_per_group * 16 // out_dim_ratio, + self.width_per_group * 16 // self.beta, + ], + dim_out=[ + self.width_per_group * 32, + self.width_per_group * 32 // self.beta, + ], + dim_inner=[dim_inner * 8, dim_inner * 8 // self.beta], + temp_kernel_sizes=temp_kernel[4], + stride=spatial_strides[3], + num_blocks=[d5] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[3], + dilation=spatial_dilations[3], + norm_module=self.norm_module, + ) + + def init_weights(self): + pass + + def forward(self, x): + x = self.s1(x) #VideoModelStem + x = self.s1_fuse(x) #FuseFastToSlow + x = self.s2(x) #ResStage + x = self.s2_fuse(x) + + # TODO: For AVA, set use_pool_af_s2=1, check mAP's improve. + if self.use_pool_af_s2: + for pathway in range(self.num_pathways): + x[pathway] = F.max_pool3d( + x=x[pathway], + kernel_size=self.pool_size_ratio[pathway], + stride=self.pool_size_ratio[pathway], + padding=[0, 0, 0], + data_format="NCDHW") + + x = self.s3(x) + x = self.s3_fuse(x) + x = self.s4(x) + x = self.s4_fuse(x) + x = self.s5(x) + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tsm.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tsm.py new file mode 100644 index 0000000..70788ec --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tsm.py @@ -0,0 +1,353 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D, + AvgPool2D) +from paddle import ParamAttr +import paddle.nn.functional as F +from paddle.regularizer import L2Decay +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + + +class ConvBNLayer(nn.Layer): + """Conv2D and BatchNorm2D layer. + + Args: + in_channels (int): Number of channels for the input. + out_channels (int): Number of channels for the output. + kernel_size (int): Kernel size. + stride (int): Stride in the Conv2D layer. Default: 1. + groups (int): Groups in the Conv2D, Default: 1. + act (str): Indicate activation after BatchNorm2D layer. + name (str): the name of an instance of ConvBNLayer. + Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method. 
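+
+    Example (editor's sketch; it mirrors the stem convolution created in
+    ``ResNetTSM`` below):
+
+        conv1 = ConvBNLayer(in_channels=3, out_channels=64, kernel_size=7,
+                            stride=2, act="relu", name="conv1")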
+ + """ + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act=None, + name=None, + data_format="NCHW"): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False, + data_format=data_format) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + + self._act = act + + self._batch_norm = BatchNorm2D( + out_channels, + weight_attr=ParamAttr(name=bn_name + "_scale", + regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(name=bn_name + "_offset", + regularizer=L2Decay(0.0)), + data_format=data_format) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + if self._act: + y = getattr(paddle.nn.functional, self._act)(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + num_seg=8, + name=None, + data_format="NCHW"): + super(BottleneckBlock, self).__init__() + self.data_format = data_format + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act="relu", + name=name + "_branch2a", + data_format=data_format) + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act="relu", + name=name + "_branch2b", + data_format=data_format) + + self.conv2 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c", + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=stride, + name=name + "_branch1", + data_format=data_format) + + self.shortcut = shortcut + self.num_seg = num_seg + + def forward(self, inputs): + if paddle.is_compiled_with_custom_device('npu'): + x = inputs + seg_num = self.num_seg + shift_ratio = 1.0 / self.num_seg + + shape = x.shape #[N*T, C, H, W] + reshape_x = x.reshape( + (-1, seg_num, shape[1], shape[2], shape[3])) #[N, T, C, H, W] + pad_x = F.pad(reshape_x, [ + 0, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + ]) #[N, T+2, C, H, W] + c1 = int(shape[1] * shift_ratio) + c2 = int(shape[1] * 2 * shift_ratio) + slice1 = pad_x[:, :seg_num, :c1, :, :] + slice2 = pad_x[:, 2:seg_num + 2, c1:c2, :, :] + slice3 = pad_x[:, 1:seg_num + 1, c2:, :, :] + concat_x = paddle.concat([slice1, slice2, slice3], + axis=2) #[N, T, C, H, W] + shifts = concat_x.reshape(shape) + else: + shifts = F.temporal_shift(inputs, + self.num_seg, + 1.0 / self.num_seg, + data_format=self.data_format) + + y = self.conv0(shifts) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + return F.relu(y) + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + name=None, + data_format="NCHW"): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + filter_size=3, + stride=stride, + act="relu", + name=name + "_branch2a", + data_format=data_format, + ) + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + filter_size=3, + act=None, + name=name + "_branch2b", + data_format=data_format, + ) 
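+
+        # Editor's note: ``ConvBNLayer`` in this file takes ``kernel_size``,
+        # not ``filter_size``, so the ``filter_size=...`` keywords used in
+        # this BasicBlock (and in ``self.short`` below) look like a typo that
+        # would raise a TypeError if the 18/34-layer variants were built;
+        # ``kernel_size`` is presumably intended. The same code path in
+        # ``ResNetTSM`` also indexes ``in_channels[block]`` although
+        # ``in_channels`` is an int there.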
+ + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + filter_size=1, + stride=stride, + name=name + "_branch1", + data_format=data_format, + ) + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(short, conv1) + y = F.relu(y) + return y + + +@BACKBONES.register() +class ResNetTSM(nn.Layer): + """ResNet TSM backbone. + + Args: + depth (int): Depth of resnet model. + pretrained (str): pretrained model. Default: None. + """ + def __init__(self, depth, num_seg=8, data_format="NCHW", pretrained=None): + super(ResNetTSM, self).__init__() + self.pretrained = pretrained + self.layers = depth + self.num_seg = num_seg + self.data_format = data_format + + supported_layers = [18, 34, 50, 101, 152] + assert self.layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, self.layers) + + if self.layers == 18: + depth = [2, 2, 2, 2] + elif self.layers == 34 or self.layers == 50: + depth = [3, 4, 6, 3] + elif self.layers == 101: + depth = [3, 4, 23, 3] + elif self.layers == 152: + depth = [3, 8, 36, 3] + + in_channels = 64 + out_channels = [64, 128, 256, 512] + + self.conv = ConvBNLayer(in_channels=3, + out_channels=64, + kernel_size=7, + stride=2, + act="relu", + name="conv1", + data_format=self.data_format) + self.pool2D_max = MaxPool2D( + kernel_size=3, + stride=2, + padding=1, + data_format=self.data_format, + ) + + self.block_list = [] + if self.layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if self.layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + conv_name, + BottleneckBlock( + in_channels=in_channels + if i == 0 else out_channels[block] * 4, + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + num_seg=self.num_seg, + shortcut=shortcut, + name=conv_name, + data_format=self.data_format)) + in_channels = out_channels[block] * 4 + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + conv_name, + BasicBlock( + in_channels=in_channels[block] + if i == 0 else out_channels[block], + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name, + data_format=self.data_format, + )) + self.block_list.append(basic_block) + shortcut = True + + def init_weights(self): + """Initiate the parameters. + Note: + 1. when indicate pretrained loading path, will load it to initiate backbone. + 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function. + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html + """ + #XXX: check bias!!! check pretrained!!! 
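Aside (illustrative, not part of the patch): a minimal forward pass for the backbone defined above. The import path is assumed from the file layout added in this diff; the shapes are examples only.

import paddle
from paddlevideo.modeling.backbones.resnet_tsm import ResNetTSM   # path assumed from this patch

num_seg = 8
backbone = ResNetTSM(depth=50, num_seg=num_seg)
backbone.init_weights()                       # random init, since pretrained is None

# The recognizer merges the batch and segment axes before calling the backbone,
# so the expected input is [N*T, 3, H, W].
frames = paddle.rand([2 * num_seg, 3, 224, 224])
feat = backbone(frames)
print(feat.shape)                             # [16, 2048, 7, 7] for depth=50 at 224x224 input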
+ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + #XXX: no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + """Define how the backbone is going to run. + + """ + #NOTE: (deprecated design) Already merge axis 0(batches) and axis 1(clips) before extracting feature phase, + # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27 + #y = paddle.reshape( + # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]]) + + #NOTE: As paddlepaddle to_static method need a "pure" model to trim. It means from + # 1. the phase of generating data[images, label] from dataloader + # to + # 2. last layer of a model, always is FC layer + + y = self.conv(inputs) + y = self.pool2D_max(y) + for block in self.block_list: + y = block(y) + return y diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tsm_MRI.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tsm_MRI.py new file mode 100644 index 0000000..e814f0f --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tsm_MRI.py @@ -0,0 +1,327 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import math + +import sys +import paddle +import paddle.nn as nn +from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D, + AvgPool2D) +from paddle import ParamAttr +import paddle.nn.functional as F + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils.save_load import load_ckpt +from paddle.regularizer import L2Decay + + +class ConvBNLayer(nn.Layer): + """Conv2D and BatchNorm2D layer. + + Args: + in_channels (int): Number of channels for the input. + out_channels (int): Number of channels for the output. + kernel_size (int): Kernel size. + stride (int): Stride in the Conv2D layer. Default: 1. + groups (int): Groups in the Conv2D, Default: 1. + is_tweaks_mode (bool): switch for tweaks. Default: False. + act (str): Indicate activation after BatchNorm2D layer. + name (str): the name of an instance of ConvBNLayer. + + Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method. + + """ + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_tweaks_mode=False, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.is_tweaks_mode = is_tweaks_mode + #ResNet-D 1/2:add a 2×2 average pooling layer with a stride of 2 before the convolution, + # whose stride is changed to 1, works well in practice. 
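Aside (illustrative, not part of the patch): the "ResNet-D" comment above refers to replacing the strided 1x1 shortcut convolution, which discards three quarters of its input, with a 2x2 average pool followed by a stride-1 1x1 convolution. A minimal sketch with example shapes:

import paddle
import paddle.nn as nn

x = paddle.rand([1, 64, 56, 56])

plain = nn.Conv2D(64, 256, kernel_size=1, stride=2)                   # original strided shortcut
tweak = nn.Sequential(nn.AvgPool2D(kernel_size=2, stride=2, ceil_mode=True),
                      nn.Conv2D(64, 256, kernel_size=1, stride=1))    # ResNet-D shortcut

print(plain(x).shape, tweak(x).shape)         # both [1, 256, 28, 28], but the pooled path sees every pixel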
+ self._pool2d_avg = AvgPool2D(kernel_size=2, + stride=2, + padding=0, + ceil_mode=True) + + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + + self._act = act + + self._batch_norm = BatchNorm2D( + out_channels, + weight_attr=ParamAttr(name=bn_name + "_scale", + regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(bn_name + "_offset", regularizer=L2Decay(0.0))) + + def forward(self, inputs): + if self.is_tweaks_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + if self._act: + y = getattr(paddle.nn.functional, self._act)(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + num_seg=8, + name=None): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act="leaky_relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act="leaky_relu", + name=name + "_branch2b") + + self.conv2 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride= + 1, #ResNet-D 2/2:add a 2×2 average pooling layer with a stride of 2 before the convolution, + # whose stride is changed to 1, works well in practice. + is_tweaks_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + self.num_seg = num_seg + + def forward(self, inputs): + shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg) + y = self.conv0(shifts) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + return F.leaky_relu(y) + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + filter_size=3, + stride=stride, + act="leaky_relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + filter_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(short, conv1) + y = F.leaky_relu(y) + return y + + +@BACKBONES.register() +class ResNetTSM_MRI(nn.Layer): + """ResNet TSM backbone. + + Args: + depth (int): Depth of resnet model. + pretrained (str): pretrained model. Default: None. 
+ """ + def __init__(self, depth, num_seg=8, pretrained=None, in_channels=1): + super(ResNetTSM_MRI, self).__init__() + self.pretrained = pretrained + self.layers = depth + self.num_seg = num_seg + self.in_channels = in_channels + + supported_layers = [18, 34, 50, 101, 152] + assert self.layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, self.layers) + + if self.layers == 18: + depth = [2, 2, 2, 2] + elif self.layers == 34 or self.layers == 50: + depth = [3, 4, 6, 3] + elif self.layers == 101: + depth = [3, 4, 23, 3] + elif self.layers == 152: + depth = [3, 8, 36, 3] + + in_channels = 64 + out_channels = [64, 128, 256, 512] + + #ResNet-C: use three 3x3 conv, replace, one 7x7 conv + self.conv1_1 = ConvBNLayer(in_channels=self.in_channels, + out_channels=32, + kernel_size=3, + stride=2, + act='leaky_relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer(in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='leaky_relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer(in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='leaky_relu', + name="conv1_3") + self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if self.layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if self.layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % + (block, i), #same with PaddleClas, for loading pretrain + BottleneckBlock( + in_channels=in_channels + if i == 0 else out_channels[block] * 4, + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + num_seg=self.num_seg, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + in_channels = out_channels[block] * 4 + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + conv_name, + BasicBlock(in_channels=in_channels[block] + if i == 0 else out_channels[block], + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + self.block_list.append(basic_block) + shortcut = True + + def init_weights(self): + """Initiate the parameters. + Note: + 1. when indicate pretrained loading path, will load it to initiate backbone. + 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function. + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html + """ + #XXX: check bias!!! check pretrained!!! + + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + #XXX: no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + """Define how the backbone is going to run. 
+ + """ + #NOTE: Already merge axis 0(batches) and axis 1(channels) before extracting feature phase, + # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27 + #y = paddle.reshape( + # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]]) + + ####ResNet-C: use three 3x3 conv, replace, one 7x7 conv + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + + y = self.pool2D_max(y) + for block in self.block_list: + y = block(y) + return y diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tsn_MRI.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tsn_MRI.py new file mode 100644 index 0000000..439a0ef --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tsn_MRI.py @@ -0,0 +1,331 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.regularizer import L2Decay +from paddle.nn import Conv2D, BatchNorm +from paddle.nn import MaxPool2D, AvgPool2D + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + +__all__ = ["ResNetTSN_MRI"] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_tweaks_mode=False, + act=None, + lr_mult=1.0, + name=None): + super(ConvBNLayer, self).__init__() + self.is_tweaks_mode = is_tweaks_mode + self._pool2d_avg = AvgPool2D(kernel_size=2, + stride=2, + padding=0, + ceil_mode=True) + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights", + learning_rate=lr_mult), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + out_channels, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale', + learning_rate=lr_mult, + regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(bn_name + '_offset', + learning_rate=lr_mult, + regularizer=L2Decay(0.0)), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_tweaks_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + lr_mult=lr_mult, + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + 
out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + lr_mult=lr_mult, + name=name + "_branch2b") + self.conv2 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + lr_mult=lr_mult, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=1, + is_tweaks_mode=False if if_first else True, + lr_mult=lr_mult, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + lr_mult=lr_mult, + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + lr_mult=lr_mult, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + is_tweaks_mode=False if if_first else True, + lr_mult=lr_mult, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +@BACKBONES.register() +class ResNetTSN_MRI(nn.Layer): + """ResNetTweaksTSN backbone. + + Args: + depth (int): Depth of resnet model. + pretrained (str): pretrained model. Default: None. 
+ """ + def __init__(self, + layers=50, + pretrained=None, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + in_channels=1): + super(ResNetTSN_MRI, self).__init__() + + self.pretrained = pretrained + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + self.lr_mult_list = lr_mult_list + self.in_channels = in_channels + assert isinstance( + self.lr_mult_list, + (list, tuple + )), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list)) + assert len( + self.lr_mult_list + ) == 5, "lr_mult_list length should should be 5 but got {}".format( + len(self.lr_mult_list)) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, 1024 + ] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.conv1_1 = ConvBNLayer(in_channels=self.in_channels, + out_channels=32, + kernel_size=3, + stride=2, + act='relu', + lr_mult=self.lr_mult_list[0], + name="conv1_1") + self.conv1_2 = ConvBNLayer(in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='relu', + lr_mult=self.lr_mult_list[0], + name="conv1_2") + self.conv1_3 = ConvBNLayer(in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='relu', + lr_mult=self.lr_mult_list[0], + name="conv1_3") + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152, 200] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + lr_mult=self.lr_mult_list[block + 1], + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock(in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name, + lr_mult=self.lr_mult_list[block + 1])) + self.block_list.append(basic_block) + shortcut = True + + def init_weights(self): + """Initiate the parameters. + Note: + 1. when indicate pretrained loading path, will load it to initiate backbone. + 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be + initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function. + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html + """ + # XXX: check bias!!! check pretrained!!! 
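Aside (illustrative, not part of the patch): this variant defaults to in_channels=1 for single-channel (e.g. MRI) frames. A minimal forward pass, with the import path assumed from this diff and example shapes:

import paddle
from paddlevideo.modeling.backbones.resnet_tsn_MRI import ResNetTSN_MRI   # path assumed

backbone = ResNetTSN_MRI(layers=50, in_channels=1)
backbone.init_weights()

frames = paddle.rand([8, 1, 224, 224])        # 8 grayscale frames flattened into the batch axis
feat = backbone(frames)
print(feat.shape)                             # [8, 2048, 7, 7]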
+ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + # XXX: no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + return y diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py new file mode 100644 index 0000000..089da4e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py @@ -0,0 +1,362 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import math + +import sys +import paddle +import paddle.nn as nn +from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D, + AvgPool2D) +from paddle import ParamAttr +import paddle.nn.functional as F +from paddle.regularizer import L2Decay + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils.save_load import load_ckpt + +# Download URL of pretrained model +# { +# "ResNet50_vd": +# "wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams", +# "ResNet101_vd": +# "https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ResNet101_vd_ssld_pretrained.pdparams", +# "ResNet18_vd": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_vd_pretrained.pdparams", +# "ResNet34_vd": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet34_vd_ssld_pretrained.pdparams", +# "ResNet152_vd": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_vd_pretrained.pdparams", +# "ResNet200_vd": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet200_vd_pretrained.pdparams", +# } + + +class ConvBNLayer(nn.Layer): + """Conv2D and BatchNorm2D layer. + Args: + in_channels (int): Number of channels for the input. + out_channels (int): Number of channels for the output. + kernel_size (int): Kernel size. + stride (int): Stride in the Conv2D layer. Default: 1. + groups (int): Groups in the Conv2D, Default: 1. + is_tweaks_mode (bool): switch for tweaks. Default: False. + act (str): Indicate activation after BatchNorm2D layer. + name (str): the name of an instance of ConvBNLayer. 
+ """ + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_tweaks_mode=False, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.is_tweaks_mode = is_tweaks_mode + #ResNet-D 1/2:add a 2×2 average pooling layer with a stride of 2 before the convolution, + # whose stride is changed to 1, works well in practice. + self._pool2d_avg = AvgPool2D(kernel_size=2, + stride=2, + padding=0, + ceil_mode=True) + + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + + self._act = act + + self._batch_norm = BatchNorm2D( + out_channels, + weight_attr=ParamAttr(name=bn_name + "_scale", + regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(bn_name + "_offset", regularizer=L2Decay(0.0))) + + def forward(self, inputs): + if self.is_tweaks_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + if self._act: + y = getattr(paddle.nn.functional, self._act)(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + num_seg=8, + name=None): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act="leaky_relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act="leaky_relu", + name=name + "_branch2b") + + self.conv2 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=1, + is_tweaks_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + self.num_seg = num_seg + + def forward(self, inputs): + if paddle.is_compiled_with_custom_device('npu'): + x = inputs + seg_num = self.num_seg + shift_ratio = 1.0 / self.num_seg + + shape = x.shape #[N*T, C, H, W] + reshape_x = x.reshape( + (-1, seg_num, shape[1], shape[2], shape[3])) #[N, T, C, H, W] + pad_x = F.pad(reshape_x, [ + 0, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + ]) #[N, T+2, C, H, W] + c1 = int(shape[1] * shift_ratio) + c2 = int(shape[1] * 2 * shift_ratio) + slice1 = pad_x[:, :seg_num, :c1, :, :] + slice2 = pad_x[:, 2:seg_num + 2, c1:c2, :, :] + slice3 = pad_x[:, 1:seg_num + 1, c2:, :, :] + concat_x = paddle.concat([slice1, slice2, slice3], + axis=2) #[N, T, C, H, W] + shifts = concat_x.reshape(shape) + else: + shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg) + + y = self.conv0(shifts) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + return F.leaky_relu(y) + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + num_seg=8, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.num_seg = num_seg + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act="leaky_relu", + name=name + "_branch2a") + self.conv1 = 
ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + # add temporal shift module + shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg) + y = self.conv0(shifts) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(short, conv1) + y = F.leaky_relu(y) + return y + + +@BACKBONES.register() +class ResNetTweaksTSM(nn.Layer): + """ResNet TSM backbone. + + Args: + depth (int): Depth of resnet model. + pretrained (str): pretrained model. Default: None. + """ + def __init__(self, depth, num_seg=8, pretrained=None): + super(ResNetTweaksTSM, self).__init__() + self.pretrained = pretrained + self.layers = depth + self.num_seg = num_seg + + supported_layers = [18, 34, 50, 101, 152] + assert self.layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, self.layers) + + if self.layers == 18: + depth = [2, 2, 2, 2] + elif self.layers == 34 or self.layers == 50: + depth = [3, 4, 6, 3] + elif self.layers == 101: + depth = [3, 4, 23, 3] + elif self.layers == 152: + depth = [3, 8, 36, 3] + + in_channels = 64 + out_channels = [64, 128, 256, 512] + + #ResNet-C: use three 3x3 conv, replace, one 7x7 conv + self.conv1_1 = ConvBNLayer(in_channels=3, + out_channels=32, + kernel_size=3, + stride=2, + act='leaky_relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer(in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='leaky_relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer(in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='leaky_relu', + name="conv1_3") + self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if self.layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if self.layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % + (block, i), #same with PaddleClas, for loading pretrain + BottleneckBlock( + in_channels=in_channels + if i == 0 else out_channels[block] * 4, + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + num_seg=self.num_seg, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + in_channels = out_channels[block] * 4 + self.block_list.append(bottleneck_block) + shortcut = True + else: + in_channels = [64, 64, 128, 256] + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + conv_name, + BasicBlock(in_channels=in_channels[block] + if i == 0 else out_channels[block], + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + num_seg=self.num_seg, + name=conv_name)) + self.block_list.append(basic_block) + shortcut = True + + def init_weights(self): + """Initiate the parameters. + Note: + 1. when indicate pretrained loading path, will load it to initiate backbone. + 2. 
when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function. + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html + """ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + # no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + """Define how the backbone is going to run. + """ + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + + y = self.pool2D_max(y) + for block in self.block_list: + y = block(y) + return y diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py new file mode 100644 index 0000000..36b3307 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py @@ -0,0 +1,328 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
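Aside (illustrative, not part of the patch): a usage sketch for ResNetTweaksTSM above. The 'bb_%d_%d' sublayer naming follows the inline comment about matching PaddleClas ResNet-vd checkpoints; the import path is assumed from this diff.

import paddle
from paddlevideo.modeling.backbones.resnet_tweaks_tsm import ResNetTweaksTSM   # path assumed

backbone = ResNetTweaksTSM(depth=50, num_seg=8)
backbone.init_weights()
feat = backbone(paddle.rand([8, 3, 224, 224]))    # one clip: N=1, T=num_seg=8
print(feat.shape)                                 # [8, 2048, 7, 7]

# Residual stages are registered as 'bb_<stage>_<block>' (see the comment above),
# which keeps parameter names aligned with PaddleClas ResNet50_vd weights.
print(list(backbone.state_dict().keys())[:3])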
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.regularizer import L2Decay +from paddle.nn import Conv2D, BatchNorm +from paddle.nn import MaxPool2D, AvgPool2D + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + +__all__ = ["ResNetTweaksTSN"] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_tweaks_mode=False, + act=None, + lr_mult=1.0, + name=None): + super(ConvBNLayer, self).__init__() + self.is_tweaks_mode = is_tweaks_mode + self._pool2d_avg = AvgPool2D(kernel_size=2, + stride=2, + padding=0, + ceil_mode=True) + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights", + learning_rate=lr_mult), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + out_channels, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale', + learning_rate=lr_mult, + regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(bn_name + '_offset', + learning_rate=lr_mult, + regularizer=L2Decay(0.0)), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_tweaks_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + lr_mult=lr_mult, + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + lr_mult=lr_mult, + name=name + "_branch2b") + self.conv2 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + lr_mult=lr_mult, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=1, + is_tweaks_mode=False if if_first else True, + lr_mult=lr_mult, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + lr_mult=lr_mult, + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + lr_mult=lr_mult, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + 
kernel_size=1, + stride=1, + is_tweaks_mode=False if if_first else True, + lr_mult=lr_mult, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +@BACKBONES.register() +class ResNetTweaksTSN(nn.Layer): + """ResNetTweaksTSN backbone. + + Args: + depth (int): Depth of resnet model. + pretrained (str): pretrained model. Default: None. + """ + def __init__(self, + layers=50, + pretrained=None, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0]): + super(ResNetTweaksTSN, self).__init__() + + self.pretrained = pretrained + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + self.lr_mult_list = lr_mult_list + assert isinstance( + self.lr_mult_list, + (list, tuple + )), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list)) + assert len( + self.lr_mult_list + ) == 5, "lr_mult_list length should should be 5 but got {}".format( + len(self.lr_mult_list)) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, 1024 + ] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.conv1_1 = ConvBNLayer(in_channels=3, + out_channels=32, + kernel_size=3, + stride=2, + act='relu', + lr_mult=self.lr_mult_list[0], + name="conv1_1") + self.conv1_2 = ConvBNLayer(in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='relu', + lr_mult=self.lr_mult_list[0], + name="conv1_2") + self.conv1_3 = ConvBNLayer(in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='relu', + lr_mult=self.lr_mult_list[0], + name="conv1_3") + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152, 200] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + lr_mult=self.lr_mult_list[block + 1], + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock(in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name, + lr_mult=self.lr_mult_list[block + 1])) + self.block_list.append(basic_block) + shortcut = True + + def init_weights(self): + """Initiate the parameters. + Note: + 1. 
when indicate pretrained loading path, will load it to initiate backbone. + 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be + initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function. + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html + """ + # XXX: check bias!!! check pretrained!!! + + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + # XXX: no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + return y diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnext101.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnext101.py new file mode 100644 index 0000000..deca671 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnext101.py @@ -0,0 +1,191 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
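Aside (illustrative, not part of the patch): lr_mult_list holds five per-stage learning-rate multipliers (stem plus four residual stages); lowering the early entries slows down, or effectively freezes, those layers during fine-tuning. A minimal sketch with an assumed import path:

import paddle
from paddlevideo.modeling.backbones.resnet_tweaks_tsn import ResNetTweaksTSN   # path assumed

# Train later stages faster than the stem and early stages.
backbone = ResNetTweaksTSN(layers=50, lr_mult_list=[0.1, 0.1, 0.5, 1.0, 1.0])
backbone.init_weights()

feat = backbone(paddle.rand([4, 3, 224, 224]))    # TSN: plain per-frame 2D features, no temporal shift
print(feat.shape)                                  # [4, 2048, 7, 7]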
+ +from paddle import ParamAttr +from paddle import fluid +import paddle.nn as nn +from paddle.nn import Conv3D, BatchNorm3D +from functools import partial + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + name=None, + data_format="NCDHW"): + super(ConvBNLayer, self).__init__() + self._conv = Conv3D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + padding_mode=padding_mode, + weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal( + fan_in=num_filters * filter_size * filter_size), name=name+'_weights'), + bias_attr=bias_attr, + data_format=data_format) + bn_name = "bn_" + name + self._batch_norm = BatchNorm3D( + num_filters, + momentum=0.9, + epsilon=1e-05, + weight_attr=ParamAttr(initializer=nn.initializer.Constant( + 1.), name=bn_name + '_scale'), + bias_attr=ParamAttr(initializer=nn.initializer.Constant( + 0.), name=bn_name + '_offset'), + data_format=data_format) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +def _downsample_basic_block(self, x, planes, stride): + out = fluid.layers.pool3d( + x, pool_size=1, pool_stride=stride, pool_type='avg') + shape = out.shape + zero_pads = fluid.layers.zeros([shape[0], planes - shape[1], shape[2], shape[3], shape[4]], + dtype='float32') + out = fluid.layers.concat([out, zero_pads], axis=1) + + +class BottleneckBlock(nn.Layer): + expansion = 2 + + def __init__(self, inplanes, planes, cardinality, stride=1, downsample=None, name=None): + super(BottleneckBlock, self).__init__() + + mid_planes = cardinality * int(planes / 32) + self.conv0 = ConvBNLayer( + inplanes, mid_planes, filter_size=1, bias_attr=False, name=name+'_branch2a') + self.conv1 = ConvBNLayer(mid_planes, mid_planes, filter_size=3, stride=stride, + padding=1, groups=cardinality, bias_attr=False, name=name+'_branch2b') + self.conv2 = ConvBNLayer(mid_planes, planes * self.expansion, + filter_size=1, bias_attr=False, name=name+'_branch2c') + self.downsample = downsample + self.stride = stride + self.relu = nn.ReLU() + + def forward(self, x): + residual = x + + out = self.conv0(x) + out = self.relu(out) + out = self.conv1(out) + out = self.relu(out) + out = self.conv2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNeXt(nn.Layer): + def __init__(self, + block, + layers, + shortcut_type='B', + cardinality=32): + self.inplanes = 64 + super(ResNeXt, self).__init__() + self.conv = ConvBNLayer( + 3, + 64, + filter_size=7, + stride=(1, 2, 2), + padding=(3, 3, 3), + bias_attr=False, + name="res_conv1" + ) + self.relu = nn.ReLU() + self.maxpool = nn.MaxPool3D(kernel_size=(3, 3, 3), stride=2, padding=1) + self.layer1 = self._make_layer(block, 128, layers[0], shortcut_type, + cardinality, stride=1, name='layer1') + + self.layer2 = self._make_layer( + block, 256, layers[1], shortcut_type, cardinality, stride=2, name='layer2') + + self.layer3 = self._make_layer( + block, 512, layers[2], shortcut_type, cardinality, stride=2, name='layer3') + + self.layer4 = self._make_layer( + block, 1024, layers[3], shortcut_type, cardinality, stride=2, name='layer4') + self.avgpool = nn.AvgPool3D((2, 1, 1), stride=1, exclusive=False) + + def _make_layer(self, + block, + planes, + blocks, + 
shortcut_type, + cardinality, + stride=1, + name=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + if shortcut_type == 'A': + downsample = partial(self._downsample_basic_block, + planes=planes * block.expansion, + stride=stride) + else: + downsample = ConvBNLayer( + self.inplanes, + planes * block.expansion, + 1, + stride=stride, + bias_attr=False, + name=name+'downsample' + ) + layers = [] + layers.append( + block(self.inplanes, planes, cardinality, stride, downsample, name=name+'_downsample')) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, + cardinality, name=name+'_res_block'+str(i))) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv(x) + x = self.relu(x) + x = self.maxpool(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + return x + + +def ResNext101(): + """Constructs a ResNext-101 model. + """ + model = ResNeXt(BottleneckBlock, [3, 4, 23, 3]) + return model diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/stgcn.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/stgcn.py new file mode 100644 index 0000000..40d9d0d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/stgcn.py @@ -0,0 +1,343 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +from ..registry import BACKBONES +from ..weight_init import weight_init_ + + +def zero(x): + return 0 + + +def iden(x): + return x + + +def einsum(x, A): + """paddle.einsum will be implemented in release/2.2. + """ + x = x.transpose((0, 2, 3, 1, 4)) + n, c, t, k, v = x.shape + k2, v2, w = A.shape + assert (k == k2 and v == v2), "Args of einsum not match!" 
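Aside (illustrative, not part of the patch): the matmul-based einsum helper in this file implements the 'nkctv,kvw->nctw' contraction used by the graph convolution below. A quick numerical check against numpy, with example sizes:

import numpy as np
import paddle
from paddlevideo.modeling.backbones.stgcn import einsum    # helper defined in this file; path assumed

n, k, c, t, v, w = 2, 3, 4, 5, 25, 25
x = paddle.rand([n, k, c, t, v])
A = paddle.rand([k, v, w])

y = einsum(x, A)                                            # -> [n, c, t, w]
ref = np.einsum('nkctv,kvw->nctw', x.numpy(), A.numpy())
print(np.allclose(y.numpy(), ref, atol=1e-5))               # expected: True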
+ x = x.reshape((n, c, t, k * v)) + A = A.reshape((k * v, w)) + y = paddle.matmul(x, A) + return y + + +def get_hop_distance(num_node, edge, max_hop=1): + A = np.zeros((num_node, num_node)) + for i, j in edge: + A[j, i] = 1 + A[i, j] = 1 + + # compute hop steps + hop_dis = np.zeros((num_node, num_node)) + np.inf + transfer_mat = [np.linalg.matrix_power(A, d) for d in range(max_hop + 1)] + arrive_mat = (np.stack(transfer_mat) > 0) + for d in range(max_hop, -1, -1): + hop_dis[arrive_mat[d]] = d + return hop_dis + + +def normalize_digraph(A): + Dl = np.sum(A, 0) + num_node = A.shape[0] + Dn = np.zeros((num_node, num_node)) + for i in range(num_node): + if Dl[i] > 0: + Dn[i, i] = Dl[i]**(-1) + AD = np.dot(A, Dn) + return AD + + +class Graph(): + + def __init__(self, + layout='openpose', + strategy='uniform', + max_hop=1, + dilation=1): + self.max_hop = max_hop + self.dilation = dilation + + self.get_edge(layout) + self.hop_dis = get_hop_distance(self.num_node, + self.edge, + max_hop=max_hop) + self.get_adjacency(strategy) + + def __str__(self): + return self.A + + def get_edge(self, layout): + # edge is a list of [child, parent] paris + + if layout == 'fsd10': + self.num_node = 25 + self_link = [(i, i) for i in range(self.num_node)] + neighbor_link = [(1, 8), (0, 1), (15, 0), (17, 15), (16, 0), + (18, 16), (5, 1), (6, 5), (7, 6), (2, 1), (3, 2), + (4, 3), (9, 8), (10, 9), (11, 10), (24, 11), + (22, 11), (23, 22), (12, 8), (13, 12), (14, 13), + (21, 14), (19, 14), (20, 19)] + self.edge = self_link + neighbor_link + self.center = 8 + elif layout == 'ntu-rgb+d': + self.num_node = 25 + self_link = [(i, i) for i in range(self.num_node)] + neighbor_1base = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5), + (7, 6), (8, 7), (9, 21), (10, 9), (11, 10), + (12, 11), (13, 1), (14, 13), (15, 14), (16, 15), + (17, 1), (18, 17), (19, 18), (20, 19), (22, 23), + (23, 8), (24, 25), (25, 12)] + neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base] + self.edge = self_link + neighbor_link + self.center = 21 - 1 + elif layout == 'coco_keypoint': + self.num_node = 17 + self_link = [(i, i) for i in range(self.num_node)] + neighbor_1base = [(0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6), + (5, 7), (6, 8), (7, 9), (8, 10), (5, 11), (6, 12), + (11, 13), (12, 14), (13, 15), (14, 16), (11, 12)] + neighbor_link = [(i, j) for (i, j) in neighbor_1base] + self.edge = self_link + neighbor_link + self.center = 11 + else: + raise ValueError("Do Not Exist This Layout.") + + def get_adjacency(self, strategy): + valid_hop = range(0, self.max_hop + 1, self.dilation) + adjacency = np.zeros((self.num_node, self.num_node)) + for hop in valid_hop: + adjacency[self.hop_dis == hop] = 1 + normalize_adjacency = normalize_digraph(adjacency) + + if strategy == 'spatial': + A = [] + for hop in valid_hop: + a_root = np.zeros((self.num_node, self.num_node)) + a_close = np.zeros((self.num_node, self.num_node)) + a_further = np.zeros((self.num_node, self.num_node)) + for i in range(self.num_node): + for j in range(self.num_node): + if self.hop_dis[j, i] == hop: + if self.hop_dis[j, self.center] == self.hop_dis[ + i, self.center]: + a_root[j, i] = normalize_adjacency[j, i] + elif self.hop_dis[j, self.center] > self.hop_dis[ + i, self.center]: + a_close[j, i] = normalize_adjacency[j, i] + else: + a_further[j, i] = normalize_adjacency[j, i] + if hop == 0: + A.append(a_root) + else: + A.append(a_root + a_close) + A.append(a_further) + A = np.stack(A) + self.A = A + else: + raise ValueError("Do Not Exist This Strategy") + + +class 
ConvTemporalGraphical(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + t_kernel_size=1, + t_stride=1, + t_padding=0, + t_dilation=1): + super().__init__() + + self.kernel_size = kernel_size + self.conv = nn.Conv2D(in_channels, + out_channels * kernel_size, + kernel_size=(t_kernel_size, 1), + padding=(t_padding, 0), + stride=(t_stride, 1), + dilation=(t_dilation, 1)) + + def forward(self, x, A): + assert A.shape[0] == self.kernel_size + + x = self.conv(x) + n, kc, t, v = x.shape + x = x.reshape((n, self.kernel_size, kc // self.kernel_size, t, v)) + x = einsum(x, A) + + return x, A + + +class st_gcn_block(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + dropout=0, + residual=True): + super(st_gcn_block, self).__init__() + + assert len(kernel_size) == 2 + assert kernel_size[0] % 2 == 1 + padding = ((kernel_size[0] - 1) // 2, 0) + + self.gcn = ConvTemporalGraphical(in_channels, out_channels, + kernel_size[1]) + + self.tcn = nn.Sequential( + nn.BatchNorm2D(out_channels), + nn.ReLU(), + nn.Conv2D( + out_channels, + out_channels, + (kernel_size[0], 1), + (stride, 1), + padding, + ), + nn.BatchNorm2D(out_channels), + nn.Dropout(dropout), + ) + + if not residual: + self.residual = zero + + elif (in_channels == out_channels) and (stride == 1): + self.residual = iden + + else: + self.residual = nn.Sequential( + nn.Conv2D(in_channels, + out_channels, + kernel_size=1, + stride=(stride, 1)), + nn.BatchNorm2D(out_channels), + ) + + self.relu = nn.ReLU() + + def forward(self, x, A): + res = self.residual(x) + x, A = self.gcn(x, A) + x = self.tcn(x) + res + return self.relu(x), A + + +@BACKBONES.register() +class STGCN(nn.Layer): + """ + ST-GCN model from: + `"Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition" `_ + Args: + in_channels: int, channels of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default 2. + edge_importance_weighting: bool, whether to use edge attention. Default True. + data_bn: bool, whether to use data BatchNorm. Default True. 
+ """ + + def __init__(self, + in_channels=2, + edge_importance_weighting=True, + data_bn=True, + layout='fsd10', + strategy='spatial', + **kwargs): + super(STGCN, self).__init__() + self.data_bn = data_bn + # load graph + self.graph = Graph( + layout=layout, + strategy=strategy, + ) + A = paddle.to_tensor(self.graph.A, dtype='float32') + self.register_buffer('A', A) + + # build networks + spatial_kernel_size = A.shape[0] + temporal_kernel_size = 9 + kernel_size = (temporal_kernel_size, spatial_kernel_size) + self.data_bn = nn.BatchNorm1D(in_channels * + A.shape[1]) if self.data_bn else iden + kwargs0 = {k: v for k, v in kwargs.items() if k != 'dropout'} + self.st_gcn_networks = nn.LayerList(( + st_gcn_block(in_channels, + 64, + kernel_size, + 1, + residual=False, + **kwargs0), + st_gcn_block(64, 64, kernel_size, 1, **kwargs), + st_gcn_block(64, 64, kernel_size, 1, **kwargs), + st_gcn_block(64, 64, kernel_size, 1, **kwargs), + st_gcn_block(64, 128, kernel_size, 2, **kwargs), + st_gcn_block(128, 128, kernel_size, 1, **kwargs), + st_gcn_block(128, 128, kernel_size, 1, **kwargs), + st_gcn_block(128, 256, kernel_size, 2, **kwargs), + st_gcn_block(256, 256, kernel_size, 1, **kwargs), + st_gcn_block(256, 256, kernel_size, 1, **kwargs), + )) + + # initialize parameters for edge importance weighting + if edge_importance_weighting: + self.edge_importance = nn.ParameterList([ + self.create_parameter( + shape=self.A.shape, + default_initializer=nn.initializer.Constant(1)) + for i in self.st_gcn_networks + ]) + else: + self.edge_importance = [1] * len(self.st_gcn_networks) + + self.pool = nn.AdaptiveAvgPool2D(output_size=(1, 1)) + + def init_weights(self): + """Initiate the parameters. + """ + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + weight_init_(layer, 'Normal', mean=0.0, std=0.02) + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Normal', mean=1.0, std=0.02) + elif isinstance(layer, nn.BatchNorm1D): + weight_init_(layer, 'Normal', mean=1.0, std=0.02) + + def forward(self, x): + # data normalization + N, C, T, V, M = x.shape + x = x.transpose((0, 4, 3, 1, 2)) # N, M, V, C, T + x = x.reshape((N * M, V * C, T)) + if self.data_bn: + x.stop_gradient = False + x = self.data_bn(x) + x = x.reshape((N, M, V, C, T)) + x = x.transpose((0, 1, 3, 4, 2)) # N, M, C, T, V + x = x.reshape((N * M, C, T, V)) + + # forward + for gcn, importance in zip(self.st_gcn_networks, self.edge_importance): + x, _ = gcn(x, paddle.multiply(self.A, importance)) + + x = self.pool(x) # NM,C,T,V --> NM,C,1,1 + C = x.shape[1] + x = paddle.reshape(x, (N, M, C, 1, 1)).mean(axis=1) # N,C,1,1 + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/swin_transformer.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/swin_transformer.py new file mode 100644 index 0000000..aaed217 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/swin_transformer.py @@ -0,0 +1,742 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
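Aside (illustrative, not part of the patch): a minimal forward pass for the ST-GCN backbone above, using the 'fsd10' layout (25 joints, 2D keypoints, one person per sample). The import path and shapes are assumptions for illustration.

import paddle
from paddlevideo.modeling.backbones.stgcn import STGCN     # path assumed from this patch

model = STGCN(in_channels=2, layout='fsd10', strategy='spatial')
model.init_weights()

N, C, T, V, M = 4, 2, 64, 25, 1            # batch, coords (x, y), frames, joints, persons
skeleton = paddle.rand([N, C, T, V, M])
feat = model(skeleton)
print(feat.shape)                           # [4, 256, 1, 1] after global pooling and person averaging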
+# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import lru_cache, reduce +from operator import mul + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import Constant + +from ...utils import load_ckpt +from ..registry import BACKBONES +from ..weight_init import trunc_normal_ + +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + # issuecomment-532968956 ... + See discussion: https://github.com/tensorflow/tpu/issues/494 + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Mlp(nn.Layer): + """ Multilayer perceptron.""" + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """window_partition + Args: + x (Tensor): x.shape = [B, D, H, W, C] + window_size (tuple[int]): window_size + + Returns: + Tensor: (B*num_windows, window_size*window_size, C) + """ + B, D, H, W, C = x.shape + x = x.reshape([ + B, D // window_size[0], window_size[0], H // window_size[1], + window_size[1], W // window_size[2], window_size[2], C + ]) + windows = x.transpose([0, 1, 3, 5, 2, 4, 6, + 7]).reshape([-1, reduce(mul, window_size), C]) + return windows + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +def window_reverse(windows, window_size, B, D, H, W): + """ + Args: + windows: (B*num_windows, window_size, window_size, C) + window_size (tuple[int]): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, D, H, W, C) + """ + x = windows.reshape([ + B, D // window_size[0], H // window_size[1], W // window_size[2], + window_size[0], window_size[1], window_size[2], -1 + ]) + x = x.transpose([0, 1, 4, 2, 5, 3, 6, 7]).reshape([B, D, H, W, -1]) + return x + + +def get_window_size(x_size, window_size, shift_size=None): + use_window_size = list(window_size) + if shift_size is not None: + use_shift_size = list(shift_size) + for i in range(len(x_size)): + if x_size[i] <= window_size[i]: + use_window_size[i] = x_size[i] + if shift_size is not None: + use_shift_size[i] = 0 + + if shift_size is None: + return 
tuple(use_window_size) + else: + return tuple(use_window_size), tuple(use_shift_size) + + +class WindowAttention3D(nn.Layer): + """ Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The temporal length, height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + """ + def __init__(self, + dim, + window_size, + num_heads, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wd, Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = self.create_parameter( + shape=((2 * window_size[0] - 1) * (2 * window_size[1] - 1) * + (2 * window_size[2] - 1), num_heads), + default_initializer=zeros_, + ) # 2*Wd-1 * 2*Wh-1 * 2*Ww-1, nH + self.add_parameter("relative_position_bias_table", + self.relative_position_bias_table) + # get pair-wise relative position index for each token inside the window + coords_d = paddle.arange(self.window_size[0]) + coords_h = paddle.arange(self.window_size[1]) + coords_w = paddle.arange(self.window_size[2]) + coords = paddle.stack(paddle.meshgrid(coords_d, coords_h, + coords_w)) # 3, Wd, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 3, Wd*Wh*Ww + + relative_coords = coords_flatten.unsqueeze( + axis=2) - coords_flatten.unsqueeze(axis=1) # 3, Wd*Wh*Ww, Wd*Wh*Ww + + # relative_coords = coords_flatten.unsqueeze(2) - coords_flatten.unsqueeze(1) # 3, Wd*Wh*Ww, Wd*Wh*Ww + relative_coords = relative_coords.transpose([1, 2, 0 + ]) # Wd*Wh*Ww, Wd*Wh*Ww, 3 + relative_coords[:, :, + 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 2] += self.window_size[2] - 1 + + relative_coords[:, :, 0] *= (2 * self.window_size[1] - + 1) * (2 * self.window_size[2] - 1) + relative_coords[:, :, 1] *= (2 * self.window_size[2] - 1) + relative_position_index = relative_coords.sum( + axis=-1) # Wd*Wh*Ww, Wd*Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=0.02) + self.softmax = nn.Softmax(axis=-1) + + def forward(self, x, mask=None): + """ Forward function. 
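+        Self-attention is computed within each 3D window; the learned relative
+        position bias (looked up through ``relative_position_index``) is added to
+        the attention logits before the softmax, and ``mask`` (if given) is added
+        on top for the shifted-window case.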
+ Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, N, N) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape( + [B_, N, 3, self.num_heads, + C // self.num_heads]).transpose([2, 0, 3, 1, 4]) + q, k, v = qkv[0], qkv[1], qkv[2] # B_, nH, N, C + + q = q * self.scale + attn = q @ k.transpose([0, 1, 3, 2]) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index[:N, :N].reshape([-1])].reshape( + [N, N, -1]) # Wd*Wh*Ww,Wd*Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wd*Wh*Ww, Wd*Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) # B_, nH, N, N + + if mask is not None: + nW = mask.shape[0] + attn = attn.reshape([B_ // nW, nW, self.num_heads, N, N + ]) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.reshape([-1, self.num_heads, N, N]) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B_, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock3D(nn.Layer): + """ Swin Transformer Block. + + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (tuple[int]): Window size. + shift_size (tuple[int]): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + """ + def __init__(self, + dim, + num_heads, + window_size=(2, 7, 7), + shift_size=(0, 0, 0), + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + use_checkpoint=False): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + # self.use_checkpoint=use_checkpoint + + assert 0 <= self.shift_size[0] < self.window_size[ + 0], "shift_size must in 0-window_size" + assert 0 <= self.shift_size[1] < self.window_size[ + 1], "shift_size must in 0-window_size" + assert 0 <= self.shift_size[2] < self.window_size[ + 2], "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention3D(dim, + window_size=self.window_size, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward_part1(self, x, mask_matrix): + B = paddle.shape(x)[0] + _, D, H, W, C = x.shape + window_size, shift_size = get_window_size((D, H, W), self.window_size, + self.shift_size) + + x = self.norm1(x) + # pad feature maps to multiples of window size + pad_l = pad_t = pad_d0 = 0 + pad_d1 = (window_size[0] - D % window_size[0]) % window_size[0] + pad_b = (window_size[1] - H % window_size[1]) % window_size[1] + pad_r = (window_size[2] - W % window_size[2]) % window_size[2] + x = F.pad(x, (pad_l, pad_r, pad_t, pad_b, pad_d0, pad_d1), + data_format='NDHWC') + _, Dp, Hp, Wp, _ = x.shape + # cyclic shift + if any(i > 0 for i in shift_size): + shifted_x = paddle.roll(x, + shifts=(-shift_size[0], -shift_size[1], + -shift_size[2]), + axis=(1, 2, 3)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + # partition windows + x_windows = window_partition(shifted_x, + window_size) # B*nW, Wd*Wh*Ww, C + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=attn_mask) # B*nW, Wd*Wh*Ww, C + # merge windows + attn_windows = attn_windows.reshape([-1, *(window_size + (C, ))]) + shifted_x = window_reverse(attn_windows, window_size, B, Dp, Hp, + Wp) # B D' H' W' C + # reverse cyclic shift + if any(i > 0 for i in shift_size): + x = paddle.roll(shifted_x, + shifts=(shift_size[0], shift_size[1], + shift_size[2]), + axis=(1, 2, 3)) + else: + x = shifted_x + + if pad_d1 > 0 or pad_r > 0 or pad_b > 0: + x = x[:, :D, :H, :W, :] + return x + + def forward_part2(self, x): + return self.drop_path(self.mlp(self.norm2(x))) + + def forward(self, x, mask_matrix): + """ Forward function. + + Args: + x: Input feature, tensor size (B, D, H, W, C). + mask_matrix: Attention mask for cyclic shift. + """ + + shortcut = x + x = self.forward_part1(x, mask_matrix) + x = shortcut + self.drop_path(x) + x = x + self.forward_part2(x) + + return x + + +class PatchMerging(nn.Layer): + """ Patch Merging Layer + + Args: + dim (int): Number of input channels. + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + """ + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x): + """ Forward function. + + Args: + x: Input feature, tensor size (B, D, H, W, C). 
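+        Returns:
+            Tensor of shape (B, D, H/2, W/2, 2*C): the four spatial neighbours are
+            concatenated along the channel axis (4*C) and projected back to 2*C
+            (H and W are zero-padded first when odd).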
+ """ + B, D, H, W, C = x.shape + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = F.pad(x, (0, W % 2, 0, H % 2, 0, 0), data_format='NDHWC') + + x0 = x[:, :, 0::2, 0::2, :] # B D H/2 W/2 C + x1 = x[:, :, 1::2, 0::2, :] # B D H/2 W/2 C + x2 = x[:, :, 0::2, 1::2, :] # B D H/2 W/2 C + x3 = x[:, :, 1::2, 1::2, :] # B D H/2 W/2 C + x = paddle.concat([x0, x1, x2, x3], -1) # B D H/2 W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +# cache each stage results +@lru_cache() +def compute_mask(D, H, W, window_size, shift_size): + img_mask = paddle.zeros((1, D, H, W, 1)) # 1 Dp Hp Wp 1 + cnt = 0 + for d in slice(-window_size[0]), slice(-window_size[0], + -shift_size[0]), slice( + -shift_size[0], None): + for h in slice(-window_size[1]), slice(-window_size[1], + -shift_size[1]), slice( + -shift_size[1], None): + for w in slice(-window_size[2]), slice(-window_size[2], + -shift_size[2]), slice( + -shift_size[2], None): + img_mask[:, d, h, w, :] = cnt + cnt += 1 + mask_windows = window_partition(img_mask, + window_size) # nW, ws[0]*ws[1]*ws[2], 1 + mask_windows = mask_windows.squeeze(-1) # nW, ws[0]*ws[1]*ws[2] + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + # attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + huns = -100.0 * paddle.ones_like(attn_mask) + attn_mask = huns * (attn_mask != 0).astype("float32") + return attn_mask + + +class BasicLayer(nn.Layer): + """ A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (tuple[int]): Local window size. Default: (1,7,7). + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None + """ + def __init__(self, + dim, + depth, + num_heads, + window_size=(1, 7, 7), + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False): + super().__init__() + self.window_size = window_size + self.shift_size = tuple(i // 2 for i in window_size) + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.LayerList([ + SwinTransformerBlock3D( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=(0, 0, 0) if (i % 2 == 0) else self.shift_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + use_checkpoint=use_checkpoint, + ) for i in range(depth) + ]) + + self.downsample = downsample + if self.downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + + def forward(self, x): + """ Forward function. + + Args: + x: Input feature, tensor size (B, C, D, H, W). 
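+        Returns:
+            Tensor in the same NCDHW layout after the stacked SwinTransformerBlock3D
+            blocks; when ``downsample`` is set, PatchMerging halves H and W and
+            doubles the channel count.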
+ """ + # calculate attention mask for SW-MSA + B = paddle.shape(x)[0] + _, C, D, H, W = x.shape + window_size, shift_size = get_window_size((D, H, W), self.window_size, + self.shift_size) + # x = rearrange(x, 'b c d h w -> b d h w c') + x = x.transpose([0, 2, 3, 4, 1]) + Dp = int(np.ceil(D / window_size[0])) * window_size[0] + Hp = int(np.ceil(H / window_size[1])) * window_size[1] + Wp = int(np.ceil(W / window_size[2])) * window_size[2] + attn_mask = compute_mask(Dp, Hp, Wp, window_size, shift_size) + for blk in self.blocks: + x = blk(x, attn_mask) + x = x.reshape([B, D, H, W, C]) + + if self.downsample is not None: + x = self.downsample(x) + x = x.transpose([0, 4, 1, 2, 3]) + return x + + +class PatchEmbed3D(nn.Layer): + """ Video to Patch Embedding. + + Args: + patch_size (int): Patch token size. Default: (2,4,4). + in_chans (int): Number of input video channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Layer, optional): Normalization layer. Default: None + """ + def __init__(self, + patch_size=(2, 4, 4), + in_chans=3, + embed_dim=96, + norm_layer=None): + super().__init__() + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv3D(in_chans, + embed_dim, + kernel_size=patch_size, + stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + _, _, D, H, W = x.shape + if W % self.patch_size[2] != 0: + x = F.pad( + x, (0, self.patch_size[2] - W % self.patch_size[2], 0, 0, 0, 0), + data_format='NCDHW') + if H % self.patch_size[1] != 0: + x = F.pad( + x, (0, 0, 0, self.patch_size[1] - H % self.patch_size[1], 0, 0), + data_format='NCDHW') + if D % self.patch_size[0] != 0: + x = F.pad( + x, (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0]), + data_format='NCDHW') + + x = self.proj(x) # B C D Wh Ww + if self.norm is not None: + D, Wh, Ww = x.shape[2], x.shape[3], x.shape[4] + x = x.flatten(2).transpose([0, 2, 1]) + x = self.norm(x) + x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, D, Wh, Ww]) + + return x + + +@BACKBONES.register() +class SwinTransformer3D(nn.Layer): + """ Swin Transformer backbone. + A Paddle impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + + Args: + patch_size (int | tuple(int)): Patch size. Default: (4,4,4). + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + num_heads (tuple[int]): Number of attention head of each stage. + window_size (int): Window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: Truee + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. + attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer: Normalization layer. Default: nn.LayerNorm. + patch_norm (bool): If True, add normalization after patch embedding. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. 
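+        pretrained (str | None): Path to pretrained weights loaded by ``init_weights``;
+            None or an empty string skips loading. Default: None.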
+ """ + def __init__(self, + pretrained=None, + patch_size=(4, 4, 4), + in_chans=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=(2, 7, 7), + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + patch_norm=False, + frozen_stages=-1, + use_checkpoint=False): + super().__init__() + + self.pretrained = pretrained + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.patch_norm = patch_norm + self.frozen_stages = frozen_stages + self.window_size = window_size + self.patch_size = patch_size + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed3D( + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in paddle.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + # build layers + self.layers = nn.LayerList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2**i_layer), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging + if i_layer < self.num_layers - 1 else None, + use_checkpoint=use_checkpoint) + self.layers.append(layer) + + self.num_features = int(embed_dim * 2**(self.num_layers - 1)) + + # add a norm layer for each output + self.norm = norm_layer(self.num_features) + + self._freeze_stages() + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.stop_gradient = True + + if self.frozen_stages >= 1: + self.pos_drop.eval() + for i in range(0, self.frozen_stages): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.stop_gradient = True + + def _init_fn(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def init_weights(self): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
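+        Note:
+            The checkpoint path is read from ``self.pretrained`` (set in ``__init__``)
+            rather than passed as an argument to this method.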
+ """ + """First init model's weight""" + + self.apply(self._init_fn) + """Second, if provide pretrained ckpt, load it""" + if isinstance( + self.pretrained, str + ) and self.pretrained.strip() != "": # load pretrained weights + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + pass + else: + raise NotImplementedError + + def forward(self, x): + """Forward function.""" + x = self.patch_embed(x) + x = self.pos_drop(x) + + for layer in self.layers: + x = layer(x) + + x = x.transpose([0, 2, 3, 4, 1]) + x = self.norm(x) + x = x.transpose([0, 4, 1, 2, 3]) + return x + + def train(self, mode=True): + """Convert the model into training mode while keep layers freezed.""" + super(SwinTransformer3D, self).train(mode) + self._freeze_stages() diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/toshift_vit.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/toshift_vit.py new file mode 100644 index 0000000..a481996 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/toshift_vit.py @@ -0,0 +1,413 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Callable + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import Constant + +from ...utils import load_ckpt +from ..registry import BACKBONES +from ..weight_init import trunc_normal_ + +__all__ = ['VisionTransformer'] + +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def to_2tuple(x): + return tuple([x] * 2) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + # issuecomment-532968956 ... + See discussion: https://github.com/tensorflow/tpu/issues/494 + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
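+    This is a thin module wrapper around ``drop_path`` that forwards ``self.training``,
+    so paths are only dropped while training.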
+ """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0): + super().__init__() + + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.attn_drop = nn.Dropout(attn_drop) + + def forward(self, x): + N, C = x.shape[1:] + qkv = self.qkv(x).reshape( + (-1, N, 3, self.num_heads, C // self.num_heads)).transpose( + (2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.1, + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + num_segments = 8, + fold_div = 4): + #attention_type='divided_space_time', + super().__init__() + self.n_seg = num_segments #ckk + self.foldP_div = fold_div #ckk + #self.attention_type = attention_type + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm1 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + + self.attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + + # Temporal Attention Parameters + ''' + if self.attention_type == 'divided_space_time': + if isinstance(norm_layer, str): + self.temporal_norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.temporal_norm1 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + self.temporal_attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + self.temporal_fc = nn.Linear(dim, dim) + ''' + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm2 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + # token_shift + def shuift_tk(self, x): + t = self.n_seg + bt, n, c = x.shape + b = bt // t + x = x.reshape([b, t, n, c]) #B T N C + + fold = c // self.foldP_div + out = paddle.zeros_like(x) + out.stop_gradient = True + # print("#### fold ", fold) + # print(out.shape) + # print(x[:, 1:, 0, :fold].unsqueeze(2).shape) + # print(out[:, :-1, 0:1, :fold].shape) + # exit(0) + out[:, :-1, 0, :fold] = x[:, 1:, 0, :fold] # shift left + out[:, 1:, 0, fold:2*fold] = x[:,:-1:, 0, fold:2*fold] + + out[:, :, 1:, :2*fold] = x[:, :, 1:, :2*fold] + out[:, :, :, 2*fold:] = x[:, :, :, 2*fold:] + + return out.reshape([bt, n, c]) + + def forward(self, x): + x = self.shuift_tk(x) + x = x + self.drop_path(self.attn(self.norm1(x))) + x = self.shuift_tk(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + def __init__(self, + img_size=224, + patch_size=16, + in_channels=3, + embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // + patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2D(in_channels, + embed_dim, + kernel_size=patch_size, + stride=patch_size) + + def forward(self, x): + B, C, T, H, W = x.shape + + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
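+        # Fold the temporal dimension into the batch before the 2D patch projection:
+        # (B, C, T, H, W) -> (B*T, C, H, W), so every frame is embedded independently.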
+ x = x.transpose((0, 2, 1, 3, 4)) + x = x.reshape([-1, C, H, W]) + x = self.proj(x) + W = x.shape[-1] + x = x.flatten(2).transpose((0, 2, 1)) + return x, T, W + + +@BACKBONES.register() +class TokenShiftVisionTransformer(nn.Layer): + """ Vision Transformer with support for patch input + """ + def __init__(self, + pretrained=None, + img_size=224, + patch_size=16, + in_channels=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + drop_rate=0, + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + num_seg=8, + attention_type='divided_space_time', + **args): + + super().__init__() + self.pretrained = pretrained + self.num_seg = num_seg + self.attention_type = attention_type + self.num_features = self.embed_dim = embed_dim + + self.patch_embed = PatchEmbed(img_size=img_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + # Positional Embeddings + self.cls_token = self.create_parameter(shape=(1, 1, embed_dim), + default_initializer=zeros_) + self.pos_embed = self.create_parameter(shape=(1, num_patches + 1, + embed_dim), + default_initializer=zeros_) + self.pos_drop = nn.Dropout(p=drop_rate) + + if self.attention_type != 'space_only': + self.time_embed = self.create_parameter(shape=(1, num_seg, + embed_dim), + default_initializer=zeros_) + self.time_drop = nn.Dropout(p=drop_rate) + + self.add_parameter("pos_embed", self.pos_embed) + self.add_parameter("cls_token", self.cls_token) + + dpr = np.linspace(0, drop_path_rate, depth) + + self.blocks = nn.LayerList([ + Block(dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + epsilon=epsilon, + num_segments= self.num_seg + ) for i in range(depth) + #attention_type=self.attention_type + ]) + + self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon) + + def init_weights(self): + """First init model's weight""" + trunc_normal_(self.pos_embed, std=0.02) + trunc_normal_(self.cls_token, std=0.02) + self.apply(self._init_fn) + + if self.attention_type == 'divided_space_time': + i = 0 + for m in self.blocks.sublayers(include_self=True): + m_str = str(m) + if 'Block' in m_str: + if i > 0: + zeros_(m.temporal_fc.weight) + zeros_(m.temporal_fc.bias) + i += 1 + + """Second, if provide pretrained ckpt, load it""" + + if isinstance( + self.pretrained, str + ) and self.pretrained.strip() != "": # load pretrained weights + load_ckpt(self, + self.pretrained, + num_patches=self.patch_embed.num_patches, + num_seg=self.num_seg, + attention_type=self.attention_type) + + def _init_fn(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + ones_(m.weight) + zeros_(m.bias) + + def forward_features(self, x): + # B = x.shape[0] + B = paddle.shape(x)[0] + x, T, W = self.patch_embed(x) # [BT,nH*nW,F] + cls_tokens = self.cls_token.expand((B * T, -1, -1)) # [1,1,F]->[BT,1,F] + x = paddle.concat((cls_tokens, x), axis=1) + pos_interp = (x.shape[1] != self.pos_embed.shape[1]) + if pos_interp: + pos_embed = self.pos_embed + cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1) + other_pos_embed = pos_embed[0, 1:, :].unsqueeze(0).transpose( + (0, 2, 1)) + P = int(other_pos_embed.shape[2]**0.5) + H = x.shape[1] // W + other_pos_embed = other_pos_embed.reshape([1, x.shape[2], P, P]) + 
new_pos_embed = F.interpolate(other_pos_embed, + size=(H, W), + mode='nearest') + new_pos_embed = new_pos_embed.flatten(2) + new_pos_embed = new_pos_embed.transpose((0, 2, 1)) + new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed), + axis=1) + x = x + new_pos_embed + else: + x = x + self.pos_embed + + x = self.pos_drop(x) + + # Attention blocks + for blk in self.blocks: + x = blk(x) + + + x = self.norm(x) + return x[:, 0] # [B, embed_dim] -> [B*T, embed_dim] + + def forward(self, x): + x = self.forward_features(x) + return x \ No newline at end of file diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/transnetv2.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/transnetv2.py new file mode 100644 index 0000000..60603e2 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/transnetv2.py @@ -0,0 +1,582 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as functional +import random +from paddle import ParamAttr + +from ..registry import BACKBONES + + +class OctConv3D(nn.Layer): + def __init__(self, in_filters, filters, kernel_size=3, dilation_rate=(1, 1, 1), alpha=0.25, + use_bias=True, kernel_initializer=nn.initializer.KaimingNormal()): + super(OctConv3D, self).__init__() + + self.low_channels = int(filters * alpha) + self.high_channels = filters - self.low_channels + + self.high_to_high = nn.Conv3D(in_filters, self.high_channels, kernel_size=kernel_size, + dilation=dilation_rate, padding=(dilation_rate[0], 1, 1), + weight_attr=ParamAttr(initializer=kernel_initializer), + bias_attr=ParamAttr( + initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias) + self.high_to_low = nn.Conv3D(self.high_channels, self.low_channels, kernel_size=kernel_size, + dilation=dilation_rate, padding=(dilation_rate[0], 1, 1), + weight_attr=ParamAttr(initializer=kernel_initializer), + bias_attr=False) + self.low_to_high = nn.Conv3D(in_filters, self.high_channels, kernel_size=kernel_size, + dilation=dilation_rate, padding=(dilation_rate[0], 1, 1), + weight_attr=ParamAttr(initializer=kernel_initializer), + bias_attr=False) + self.low_to_low = nn.Conv3D(self.high_channels, self.low_channels, kernel_size=kernel_size, + dilation=dilation_rate, padding=(dilation_rate[0], 1, 1), + weight_attr=ParamAttr(initializer=kernel_initializer), + bias_attr=ParamAttr( + initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias) + self.upsampler = nn.Upsample(size=(1, 2, 2), data_format='NCDHW') + self.downsampler = nn.AvgPool3D(kernel_size=(1, 2, 2), stride=(1, 2, 2), padding=(0, 1, 1)) + + @staticmethod + def pad_to(tensor, target_shape): + shape = tensor.shape + padding = [[0, tar - curr] for curr, tar in zip(shape, target_shape)] + return functional.pad(tensor, padding, "CONSTANT", data_format='NCDHW') + + @staticmethod + def crop_to(tensor, target_width, target_height): + 
return tensor[:, :, :target_height, :target_width] + + def forward(self, inputs): + low_inputs, high_inputs = inputs + + high_to_high = self.high_to_high(high_inputs) + high_to_low = self.high_to_low(self.downsampler(high_inputs)) + + low_to_high = self.upsampler(self.low_to_high(low_inputs)) + low_to_low = self.low_to_low(low_inputs) + + high_output = high_to_high[:, :, :, :low_to_high.shape[3], :low_to_high.shape[4]] + low_to_high + low_output = low_to_low + high_to_low[:, :, :, :low_to_low.shape[3], :low_to_low.shape[4]] + + return low_output, high_output + + +class Conv3DConfigurable(nn.Layer): + def __init__(self, + in_filters, + filters, + dilation_rate, + separable=True, + octave=False, + use_bias=True): + super(Conv3DConfigurable, self).__init__() + assert not (separable and octave) + + if separable: + conv1 = nn.Conv3D(in_filters, 2 * filters, kernel_size=(1, 3, 3), + dilation=(1, 1, 1), padding=(0, 1, 1), + weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()), + bias_attr=False) + conv2 = nn.Conv3D(2 * filters, filters, kernel_size=(3, 1, 1), + dilation=(dilation_rate, 1, 1), padding=(dilation_rate, 0, 0), + weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()), + bias_attr=ParamAttr( + initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias) + self.layers = nn.LayerList([conv1, conv2]) + elif octave: + conv = OctConv3D(in_filters, filters, kernel_size=3, dilation_rate=(dilation_rate, 1, 1), + use_bias=use_bias, + kernel_initializer=nn.initializer.KaimingNormal()) + self.layers = [conv] + else: + conv = nn.Conv3D(in_filters, filters, kernel_size=3, + dilation=(dilation_rate, 1, 1), padding=(dilation_rate, 1, 1), + weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()), + bias_attr=ParamAttr( + initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias) + self.layers = nn.LayerList([conv]) + + def forward(self, inputs): + x = inputs + for layer in self.layers: + x = layer(x) + return x + + +class DilatedDCNNV2(nn.Layer): + def __init__(self, + in_filters, + filters, + batch_norm=True, + activation=None, + octave_conv=False): + super(DilatedDCNNV2, self).__init__() + assert not (octave_conv and batch_norm) + + self.Conv3D_1 = Conv3DConfigurable(in_filters, filters, 1, use_bias=not batch_norm, octave=octave_conv) + self.Conv3D_2 = Conv3DConfigurable(in_filters, filters, 2, use_bias=not batch_norm, octave=octave_conv) + self.Conv3D_4 = Conv3DConfigurable(in_filters, filters, 4, use_bias=not batch_norm, octave=octave_conv) + self.Conv3D_8 = Conv3DConfigurable(in_filters, filters, 8, use_bias=not batch_norm, octave=octave_conv) + self.octave = octave_conv + + self.bn = nn.BatchNorm3D(filters * 4, momentum=0.99, epsilon=1e-03, + weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=1.)), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)) + ) if batch_norm else None + self.activation = activation + + def forward(self, inputs): + conv1 = self.Conv3D_1(inputs) + conv2 = self.Conv3D_2(inputs) + conv3 = self.Conv3D_4(inputs) + conv4 = self.Conv3D_8(inputs) + + # shape of convi[j]/convi is [B, 3, T, H, W], concat in channel dimension + if self.octave: + x = [paddle.concat([conv1[0], conv2[0], conv3[0], conv4[0]], axis=1), + paddle.concat([conv1[1], conv2[1], conv3[1], conv4[1]], axis=1)] + else: + x = paddle.concat([conv1, conv2, conv3, conv4], axis=1) + + if self.bn is not None: + x = self.bn(x) + + if self.activation is not None: + if self.octave: + x = [self.activation(x[0]), 
self.activation(x[1])] + else: + x = self.activation(x) + return x + + +class StackedDDCNNV2(nn.Layer): + def __init__(self, + in_filters, + n_blocks, + filters, + shortcut=True, + use_octave_conv=False, + pool_type="avg", + stochastic_depth_drop_prob=0.0): + super(StackedDDCNNV2, self).__init__() + assert pool_type == "max" or pool_type == "avg" + if use_octave_conv and pool_type == "max": + print("WARN: Octave convolution was designed with average pooling, not max pooling.") + + self.shortcut = shortcut + self.DDCNN = nn.LayerList([ + DilatedDCNNV2(in_filters if i == 1 else filters * 4, filters, octave_conv=use_octave_conv, + activation=functional.relu if i != n_blocks else None) for i in range(1, n_blocks + 1) + ]) + self.pool = nn.MaxPool3D(kernel_size=(1, 2, 2)) if pool_type == "max" else nn.AvgPool3D(kernel_size=(1, 2, 2)) + self.octave = use_octave_conv + self.stochastic_depth_drop_prob = stochastic_depth_drop_prob + + def forward(self, inputs): + x = inputs + shortcut = None + + if self.octave: + x = [self.pool(x), x] + for block in self.DDCNN: + x = block(x) + if shortcut is None: + shortcut = x + # shape of x[i] is [B, 3, T, H, W], concat in channel dimension + if self.octave: + x = paddle.concat([x[0], self.pool(x[1])], axis=1) + + x = functional.relu(x) + + if self.shortcut is not None: + if self.stochastic_depth_drop_prob != 0.: + if self.training: + if random.random() < self.stochastic_depth_drop_prob: + x = shortcut + else: + x = x + shortcut + else: + x = (1 - self.stochastic_depth_drop_prob) * x + shortcut + else: + x += shortcut + + if not self.octave: + x = self.pool(x) + return x + + +class ResNetBlock(nn.Layer): + def __init__(self, in_filters, filters, strides=(1, 1)): + super(ResNetBlock, self).__init__() + + self.conv1 = nn.Conv2D(in_filters, filters, kernel_size=(3, 3), stride=strides, padding=(1, 1), + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=False) + self.bn1 = nn.BatchNorm2D(filters, + weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=1.)), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))) + + self.conv2 = nn.Conv2D(filters, filters, kernel_size=(3, 3), padding=(1, 1), + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=False) + self.bn2 = nn.BatchNorm2D(filters, + weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))) + + def forward(self, inputs): + x = self.conv1(inputs) + x = self.bn1(x) + x = functional.relu(x) + + x = self.conv2(x) + x = self.bn2(x) + + shortcut = inputs + x += shortcut + + return functional.relu(x) + + +class ResNetFeatures(nn.Layer): + def __init__(self, in_filters=3, + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]): + super(ResNetFeatures, self).__init__() + self.conv1 = nn.Conv2D(in_channels=in_filters, out_channels=64, kernel_size=(7, 7), + stride=(2, 2), padding=(3, 3), + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=False) + self.bn1 = nn.BatchNorm2D(num_features=64, momentum=0.99, epsilon=1e-03, + weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=1.)), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)) + ) + self.max_pool = nn.MaxPool2D(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + + self.layer2a = ResNetBlock(64, 64) + self.layer2b = ResNetBlock(64, 64) + + self.mean = paddle.to_tensor(mean) + self.std = paddle.to_tensor(std) + + def forward(self, inputs): + shape = 
inputs.shape + x = paddle.reshape(inputs, [shape[0] * shape[2], shape[1], shape[3], shape[4]]) + x = (x - self.mean) / self.std + + x = self.conv1(x) + x = self.bn1(x) + x = functional.relu(x) + x = self.max_pool(x) + x = self.layer2a(x) + x = self.layer2b(x) + + new_shape = x.shape + x = paddle.reshape(x, [shape[0], new_shape[1], shape[2], new_shape[2], new_shape[3]]) + return x + + +class FrameSimilarity(nn.Layer): + def __init__(self, + in_filters, + similarity_dim=128, + lookup_window=101, + output_dim=128, + stop_gradient=False, + use_bias=False): + super(FrameSimilarity, self).__init__() + self.projection = nn.Linear(in_filters, similarity_dim, + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=use_bias) + self.fc = nn.Linear(lookup_window, output_dim, + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))) + + self.lookup_window = lookup_window + self.stop_gradient = stop_gradient + assert lookup_window % 2 == 1, "`lookup_window` must be odd integer" + + def forward(self, inputs): + x = paddle.concat([paddle.mean(x, axis=[3, 4]) for x in inputs], axis=1) + x = paddle.transpose(x, (0, 2, 1)) + + if self.stop_gradient: + x = x.stop_gradient + + x = self.projection(x) + x = functional.normalize(x, p=2, axis=2) + batch_size = paddle.slice(x.shape, starts=[0], ends=[1], axes=[0]) if x.shape[0] == -1 else x.shape[0] + time_window = x.shape[1] + similarities = paddle.bmm(x, x.transpose([0, 2, 1])) # [batch_size, time_window, time_window] + + similarities_padded = functional.pad(similarities, + [(self.lookup_window - 1) // 2, (self.lookup_window - 1) // 2], + data_format='NCL') + + batch_indices = paddle.arange(0, batch_size).reshape([batch_size, 1, 1]) + batch_indices = paddle.tile(batch_indices, [1, time_window, self.lookup_window]) + time_indices = paddle.arange(0, time_window).reshape([1, time_window, 1]) + time_indices = paddle.tile(time_indices, [batch_size, 1, self.lookup_window]) + lookup_indices = paddle.arange(0, self.lookup_window).reshape([1, 1, self.lookup_window]) + lookup_indices = paddle.tile(lookup_indices, [batch_size, time_window, 1]) + time_indices + indices = paddle.stack([batch_indices, time_indices, lookup_indices], -1) + similarities = paddle.gather_nd(similarities_padded, indices) + return functional.relu(self.fc(similarities)) + + +class ConvexCombinationRegularization(nn.Layer): + def __init__(self, in_filters, filters=32, delta_scale=10., loss_weight=0.01): + super(ConvexCombinationRegularization, self).__init__() + + self.projection = nn.Conv3D(in_filters, filters, kernel_size=1, dilation=1, padding=(0, 0, 0), + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))) + self.features = nn.Conv3D((filters * 3), filters * 2, + kernel_size=(3, 3, 3), dilation=1, padding=(1, 1, 1), + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))) + self.dense = nn.Linear(64, 1, weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), bias_attr=True) + self.loss = nn.SmoothL1Loss(reduction='none') + self.delta_scale = delta_scale + self.loss_weight = loss_weight + + def forward(self, image_inputs, feature_inputs): + x = feature_inputs + x = self.projection(x) + x = functional.relu(x) + batch_size = x.shape[0] + window_size = x.shape[2] + first_frame = paddle.tile(x[:, :, :1], [1, 1, window_size, 
1, 1]) + last_frame = paddle.tile(x[:, :, -1:], [1, 1, window_size, 1, 1]) + x = paddle.concat([x, first_frame, last_frame], 1) + x = self.features(x) + x = functional.relu(x) + x = paddle.mean(x, axis=[3, 4]) + x = paddle.transpose(x, (0, 2, 1)) + alpha = self.dense(x) + alpha = paddle.transpose(alpha, (0, 2, 1)) + + first_img = paddle.tile(image_inputs[:, :, :1], [1, 1, window_size, 1, 1]) + last_img = paddle.tile(image_inputs[:, :, -1:], [1, 1, window_size, 1, 1]) + + alpha_ = functional.sigmoid(alpha) + alpha_ = paddle.reshape(alpha_, [batch_size, 1, window_size, 1, 1]) + predictions_ = (alpha_ * first_img + (1 - alpha_) * last_img) + loss_ = self.loss(label=image_inputs / self.delta_scale, input=predictions_ / self.delta_scale) + loss_ = self.loss_weight * paddle.mean(loss_) + return alpha, loss_ + + +class ColorHistograms(nn.Layer): + def __init__(self, + lookup_window=101, + output_dim=None): + super(ColorHistograms, self).__init__() + + self.fc = nn.Linear(lookup_window, output_dim, + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr( + initializer=nn.initializer.Constant(value=0.))) if output_dim is not None else None + self.lookup_window = lookup_window + assert lookup_window % 2 == 1, "`lookup_window` must be odd integer" + + def compute_color_histograms(self, frames): + frames = frames.astype('int32') + + def get_bin(frames): + # returns 0 .. 511 + R, G, B = frames[:, :, 0], frames[:, :, 1], frames[:, :, 2] + R, G, B = R // 32, G // 32, B // 32 + return (R * 64) + (G * 8) + B + + batch_size = paddle.slice(frames.shape, starts=[0], ends=[1], axes=[0]) if frames.shape[0] == -1 else frames.shape[0] + time_window, height, width, no_channels = frames.shape[1:] + + assert no_channels == 3 or no_channels == 6 + if no_channels == 3: + frames_flatten = frames.reshape([-1, height * width, 3]) + else: + frames_flatten = frames.reshape([-1, height * width * 2, 3]) + + binned_values = get_bin(frames_flatten) + + frame_bin_prefix = (paddle.arange(0, batch_size * time_window) * 512).reshape([-1, 1]) + binned_values = (binned_values + frame_bin_prefix).reshape([-1, 1]) + histograms = paddle.zeros_like(frame_bin_prefix, dtype='int32').tile([512]).reshape([-1]) + histograms = histograms.scatter_nd_add(binned_values, paddle.ones_like(binned_values, dtype='int32').reshape([-1])) + histograms = histograms.reshape([batch_size, time_window, 512]).astype('float32') + histograms_normalized = functional.normalize(histograms, p=2, axis=2) + return histograms_normalized + + def forward(self, inputs): + x = self.compute_color_histograms(inputs) + batch_size = paddle.slice(x.shape, starts=[0], ends=[1], axes=[0]) if x.shape[0] == -1 else x.shape[0] + time_window = x.shape[1] + similarities = paddle.bmm(x, x.transpose([0, 2, 1])) # [batch_size, time_window, time_window] + similarities_padded = functional.pad(similarities, + [(self.lookup_window - 1) // 2, (self.lookup_window - 1) // 2], + data_format='NCL') + + batch_indices = paddle.arange(0, batch_size).reshape([batch_size, 1, 1]) + batch_indices = paddle.tile(batch_indices, [1, time_window, self.lookup_window]) + time_indices = paddle.arange(0, time_window).reshape([1, time_window, 1]) + time_indices = paddle.tile(time_indices, [batch_size, 1, self.lookup_window]) + lookup_indices = paddle.arange(0, self.lookup_window).reshape([1, 1, self.lookup_window]) + lookup_indices = paddle.tile(lookup_indices, [batch_size, time_window, 1]) + time_indices + + indices = paddle.stack([batch_indices, time_indices, lookup_indices], 
-1) + similarities = paddle.gather_nd(similarities_padded, indices) + + if self.fc is not None: + return functional.relu(self.fc(similarities)) + return similarities + + +@BACKBONES.register() +class TransNetV2(nn.Layer): + """TransNetV2 model from + `"TransNet V2: An effective deep network architecture for fast shot transition detection" `_ + """ + def __init__(self, + F=16, L=3, S=2, D=1024, + use_many_hot_targets=True, + use_frame_similarity=True, + use_color_histograms=True, + use_mean_pooling=False, + dropout_rate=0.5, + use_convex_comb_reg=False, + use_resnet_features=False, + use_resnet_like_top=False, + frame_similarity_on_last_layer=False, + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]): + super(TransNetV2, self).__init__() + + self.mean = np.array(mean, np.float32).reshape([1, 3, 1, 1]) * 255 + self.std = np.array(std, np.float32).reshape([1, 3, 1, 1]) * 255 + + self.use_resnet_features = use_resnet_features + self.resnet_layers = ResNetFeatures(in_filters=3, mean=self.mean, std=self.std) if self.use_resnet_features else None + self.resnet_like_top = use_resnet_like_top + if self.resnet_like_top: + self.resnet_like_top_conv = nn.Conv3D(64 if self.use_resnet_features else 3, 32, kernel_size=(3, 7, 7), + stride=(1, 2, 2), + padding=(1, 3, 3), + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=False) + self.resnet_like_top_bn = nn.BatchNorm3D(32, momentum=0.99, epsilon=1e-03, + weight_attr=ParamAttr( + initializer=nn.initializer.Constant(value=1.)), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))) + self.resnet_like_top_max_pool = nn.MaxPool3D(kernel_size=(1, 3, 3), stride=(1, 2, 2), + padding=(0, 1, 1)) + + if self.resnet_like_top: + in_filters = 32 + elif self.use_resnet_features: + in_filters = 64 + else: + in_filters = 3 + self.SDDCNN = nn.LayerList( + [StackedDDCNNV2(in_filters=in_filters, n_blocks=S, filters=F, + stochastic_depth_drop_prob=0.)] + + [StackedDDCNNV2(in_filters=(F * 2 ** (i - 1)) * 4, n_blocks=S, filters=F * 2 ** i) for i in range(1, L)] + ) + + self.frame_sim_layer = FrameSimilarity( + sum([(F * 2 ** i) * 4 for i in range(L)]), lookup_window=101, output_dim=128, similarity_dim=128, + use_bias=True + ) if use_frame_similarity else None + self.color_hist_layer = ColorHistograms( + lookup_window=101, output_dim=128 + ) if use_color_histograms else None + + self.dropout = nn.Dropout(dropout_rate) if dropout_rate is not None else None + + output_dim = ((F * 2 ** (L - 1)) * 4) * 3 * 6 # 3x6 for spatial dimensions + if use_frame_similarity: output_dim += 128 + if use_color_histograms: output_dim += 128 + + self.use_mean_pooling = use_mean_pooling + + self.has_downsample = False + if self.use_resnet_features or self.resnet_like_top or self.use_mean_pooling: + self.has_downsample = True + self.fc1 = nn.Linear(512 if self.has_downsample else output_dim, D, + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)) + ) + self.frame_similarity_on_last_layer = frame_similarity_on_last_layer + self.cls_layer1 = nn.Linear(1152 if self.frame_similarity_on_last_layer else D, 1, + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)) + ) + self.cls_layer2 = nn.Linear(1152 if self.frame_similarity_on_last_layer else D, 1, + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)) + ) 
if use_many_hot_targets else None + + self.convex_comb_reg = ConvexCombinationRegularization( + in_filters=(F * 2 ** (L - 1) * 4)) if use_convex_comb_reg else None + + def forward(self, inputs): + assert list(inputs.shape[2:]) == [27, 48, 3] and inputs.dtype == paddle.float32, \ + "incorrect input type and/or shape" + out_dict = {} + + # shape [B, T, H, W, 3] to shape [B, 3, T, H, W] + x = inputs.transpose([0, 4, 1, 2, 3]) + if self.use_resnet_features: + x = self.resnet_layers(x) + else: + x = x / 255. + inputs = inputs.clip(min=0).astype('uint8') + if self.resnet_like_top: + x = self.resnet_like_top_conv(x) + x = self.resnet_like_top_bn(x) + x = self.resnet_like_top_max_pool(x) + block_features = [] + for block in self.SDDCNN: + x = block(x) + block_features.append(x) + if self.convex_comb_reg is not None: + out_dict["alphas"], out_dict["comb_reg_loss"] = self.convex_comb_reg(inputs.transpose([0, 4, 1, 2, 3]), x) + if self.use_mean_pooling: + x = paddle.mean(x, axis=[3, 4]) + x = x.transpose([0, 2, 1]) + else: + x = x.transpose([0, 2, 3, 4, 1]) + x = x.reshape([x.shape[0], x.shape[1], x.shape[2]*x.shape[3]*x.shape[4]]) + if self.frame_sim_layer is not None: + x = paddle.concat([self.frame_sim_layer(block_features), x], 2) + if self.color_hist_layer is not None: + x = paddle.concat([self.color_hist_layer(inputs), x], 2) + x = self.fc1(x) + x = functional.relu(x) + if self.dropout is not None: + x = self.dropout(x) + if self.frame_sim_layer is not None and self.frame_similarity_on_last_layer: + x = paddle.concat([self.frame_sim_layer(block_features), x], 2) + one_hot = self.cls_layer1(x) + if self.cls_layer2 is not None: + out_dict["many_hot"] = self.cls_layer2(x) + + if len(out_dict) > 0: + return one_hot, out_dict + + return one_hot + diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/vit.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/vit.py new file mode 100644 index 0000000..84f434f --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/vit.py @@ -0,0 +1,465 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Callable + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import Constant + +from ...utils import load_ckpt +from ..registry import BACKBONES +from ..weight_init import trunc_normal_ + +__all__ = ['VisionTransformer'] + +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def to_2tuple(x): + return tuple([x] * 2) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + # issuecomment-532968956 ... + See discussion: https://github.com/tensorflow/tpu/issues/494 + """ + if drop_prob == 0. 
or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0): + super().__init__() + + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.attn_drop = nn.Dropout(attn_drop) + + def forward(self, x): + N, C = x.shape[1:] + qkv = self.qkv(x).reshape( + (-1, N, 3, self.num_heads, C // self.num_heads)).transpose( + (2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.1, + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + attention_type='divided_space_time'): + + super().__init__() + self.attention_type = attention_type + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm1 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + + self.attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + + # Temporal Attention Parameters + if self.attention_type == 'divided_space_time': + if isinstance(norm_layer, str): + self.temporal_norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.temporal_norm1 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + self.temporal_attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + self.temporal_fc = nn.Linear(dim, dim) + + # NOTE: drop path for 
stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm2 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward(self, x, B, T, W): + num_spatial_tokens = (x.shape[1] - 1) // T + H = num_spatial_tokens // W + if self.attention_type in ['space_only', 'joint_space_time']: + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + elif self.attention_type == 'divided_space_time': + ########## Temporal ########## + xt = x[:, 1:, :] + _, _, _, _t, _m = B, H, W, T, xt.shape[-1] + xt = xt.reshape([-1, _t, _m]) + + res_temporal = self.drop_path( + self.temporal_attn(self.temporal_norm1(xt))) + + _, _h, _w, _t, _m = B, H, W, T, res_temporal.shape[-1] + res_temporal = res_temporal.reshape([-1, _h * _w * _t, _m]) + + res_temporal = self.temporal_fc(res_temporal) + xt = x[:, 1:, :] + res_temporal + + ########## Spatial ########## + init_cls_token = x[:, 0, :].unsqueeze(1) + cls_token = init_cls_token.tile((1, T, 1)) + _b, _t, _m = cls_token.shape + cls_token = cls_token.reshape([-1, _m]).unsqueeze(1) + + xs = xt + _, _h, _w, _t, _m = B, H, W, T, xs.shape[-1] + xs = xs.reshape([-1, _h, _w, _t, _m]).transpose( + (0, 3, 1, 2, 4)).reshape([-1, _h * _w, _m]) + xs = paddle.concat((cls_token, xs), axis=1) + res_spatial = self.drop_path(self.attn(self.norm1(xs))) + + # Taking care of CLS token + cls_token = res_spatial[:, 0, :] + _, _t, _m = B, T, cls_token.shape[-1] + cls_token = cls_token.reshape([-1, _t, _m]) + # averaging for every frame + cls_token = paddle.mean(cls_token, axis=1, keepdim=True) + + res_spatial = res_spatial[:, 1:, :] + _, _t, _h, _w, _m = B, T, H, W, res_spatial.shape[-1] + res_spatial = res_spatial.reshape([-1, _t, _h, _w, _m]).transpose( + (0, 2, 3, 1, 4)).reshape([-1, _h * _w * _t, _m]) + + res = res_spatial + x = xt + x = paddle.concat((init_cls_token, x), axis=1) + paddle.concat( + (cls_token, res), axis=1) + + # Mlp + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + else: + raise NotImplementedError + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + def __init__(self, + img_size=224, + patch_size=16, + in_channels=3, + embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // + patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2D(in_channels, + embed_dim, + kernel_size=patch_size, + stride=patch_size) + + def forward(self, x): + B, C, T, H, W = x.shape + + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
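        # Editor's note (annotation, not part of the patch): the reshapes below fold the
        # temporal axis into the batch axis and patchify each frame. With the defaults
        # (img_size=224, patch_size=16, embed_dim=768) an input of shape [B, 3, T, 224, 224]
        # becomes x of shape [B*T, 14*14, 768]; T is returned unchanged and W is re-bound
        # to the patch-grid width (14), which forward_features later uses to recover the
        # spatial layout of the tokens.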
+ x = x.transpose((0, 2, 1, 3, 4)) + x = x.reshape([-1, C, H, W]) + x = self.proj(x) + W = x.shape[-1] + x = x.flatten(2).transpose((0, 2, 1)) + return x, T, W + + +@BACKBONES.register() +class VisionTransformer(nn.Layer): + """ Vision Transformer with support for patch input + """ + def __init__(self, + pretrained=None, + img_size=224, + patch_size=16, + in_channels=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + num_seg=8, + attention_type='divided_space_time', + **args): + super().__init__() + self.pretrained = pretrained + self.num_seg = num_seg + self.attention_type = attention_type + self.num_features = self.embed_dim = embed_dim + + self.patch_embed = PatchEmbed(img_size=img_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + # Positional Embeddings + self.cls_token = self.create_parameter(shape=(1, 1, embed_dim), + default_initializer=zeros_) + self.pos_embed = self.create_parameter(shape=(1, num_patches + 1, + embed_dim), + default_initializer=zeros_) + self.pos_drop = nn.Dropout(p=drop_rate) + + if self.attention_type != 'space_only': + self.time_embed = self.create_parameter(shape=(1, num_seg, + embed_dim), + default_initializer=zeros_) + self.time_drop = nn.Dropout(p=drop_rate) + + self.add_parameter("pos_embed", self.pos_embed) + self.add_parameter("cls_token", self.cls_token) + + dpr = np.linspace(0, drop_path_rate, depth) + + self.blocks = nn.LayerList([ + Block(dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + epsilon=epsilon, + attention_type=self.attention_type) for i in range(depth) + ]) + + self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon) + + def init_weights(self): + """First init model's weight""" + trunc_normal_(self.pos_embed, std=0.02) + trunc_normal_(self.cls_token, std=0.02) + self.apply(self._init_fn) + + if self.attention_type == 'divided_space_time': + i = 0 + for m in self.blocks.sublayers(include_self=True): + m_str = str(m) + if 'Block' in m_str: + if i > 0: + zeros_(m.temporal_fc.weight) + zeros_(m.temporal_fc.bias) + i += 1 + """Second, if provide pretrained ckpt, load it""" + if isinstance( + self.pretrained, str + ) and self.pretrained.strip() != "": # load pretrained weights + load_ckpt(self, + self.pretrained, + num_patches=self.patch_embed.num_patches, + num_seg=self.num_seg, + attention_type=self.attention_type) + + def _init_fn(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + ones_(m.weight) + zeros_(m.bias) + + def forward_features(self, x): + # B = x.shape[0] + B = paddle.shape(x)[0] + x, T, W = self.patch_embed(x) # [BT,nH*nW,F] + cls_tokens = self.cls_token.expand((B * T, -1, -1)) # [1,1,F]->[BT,1,F] + x = paddle.concat((cls_tokens, x), axis=1) + pos_interp = (x.shape[1] != self.pos_embed.shape[1]) + if pos_interp: + pos_embed = self.pos_embed + cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1) + other_pos_embed = pos_embed[0, 1:, :].unsqueeze(0).transpose( + (0, 2, 1)) + P = int(other_pos_embed.shape[2]**0.5) + H = x.shape[1] // W + other_pos_embed = other_pos_embed.reshape([1, x.shape[2], P, P]) + new_pos_embed = F.interpolate(other_pos_embed, + size=(H, 
W), + mode='nearest') + new_pos_embed = new_pos_embed.flatten(2) + new_pos_embed = new_pos_embed.transpose((0, 2, 1)) + new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed), + axis=1) + x = x + new_pos_embed + else: + x = x + self.pos_embed + + x = self.pos_drop(x) + + # Time Embeddings + if self.attention_type != 'space_only': + cls_tokens = x[:B, 0, :].unsqueeze(1) if B > 0 else x.split( + T)[0].index_select(paddle.to_tensor([0]), axis=1) + x = x[:, 1:] + _, _n, _m = x.shape + _t = T + x = x.reshape([-1, _t, _n, _m]).transpose( + (0, 2, 1, 3)).reshape([-1, _t, _m]) + # Resizing time embeddings in case they don't match + time_interp = (T != self.time_embed.shape[1]) + if time_interp: # T' != T + time_embed = self.time_embed.transpose((0, 2, 1)).unsqueeze(0) + new_time_embed = F.interpolate(time_embed, + size=(T, x.shape[-1]), + mode='nearest').squeeze(0) + new_time_embed = new_time_embed.transpose((0, 2, 1)) + x = x + new_time_embed + else: + x = x + self.time_embed + + x = self.time_drop(x) + _, _t, _m = x.shape + x = x.reshape([-1, W * W * T, _m]) + x = paddle.concat((cls_tokens, x), axis=1) + + # Attention blocks + for blk in self.blocks: + x = blk(x, B, T, W) + + # Predictions for space-only baseline + if self.attention_type == 'space_only': + _, _n, _m = x.shape + _t = T + x = x.reshape([-1, _t, _n, _m]) + x = paddle.mean(x, 1) # averaging predictions for every frame + + x = self.norm(x) + return x[:, 0] # [B, embed_dim] + + def forward(self, x): + x = self.forward_features(x) + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/vit_tweaks.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/vit_tweaks.py new file mode 100644 index 0000000..a20af30 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/vit_tweaks.py @@ -0,0 +1,515 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Callable + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import Constant +from paddle.regularizer import L2Decay + +from ...utils import load_ckpt +from ..registry import BACKBONES +from ..weight_init import trunc_normal_ + +__all__ = ['VisionTransformer_tweaks'] + +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def to_2tuple(x): + return tuple([x] * 2) + + +def rand_bbox(size, lam): + """ rand_bbox """ + w = size[2] + h = size[3] + cut_rat = np.sqrt(1. 
- lam) + cut_w = np.int(w * cut_rat) + cut_h = np.int(h * cut_rat) + + # uniform + cx = np.random.randint(w) + cy = np.random.randint(h) + + bbx1 = np.clip(cx - cut_w // 2, 0, w) + bby1 = np.clip(cy - cut_h // 2, 0, h) + bbx2 = np.clip(cx + cut_w // 2, 0, w) + bby2 = np.clip(cy + cut_h // 2, 0, h) + + return bbx1, bby1, bbx2, bby2 + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + # issuecomment-532968956 ... + See discussion: https://github.com/tensorflow/tpu/issues/494 + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0., + wd_bias=True, + lr_mult=1.0): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + wd_bias=True, + lr_mult=1.0): + super().__init__() + + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.attn_drop = nn.Dropout(attn_drop) + + def forward(self, x): + N, C = x.shape[1:] + qkv = self.qkv(x).reshape( + (-1, N, 3, self.num_heads, C // self.num_heads)).transpose( + (2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.1, + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + attention_type='divided_space_time', + wd_bias=True, + lr_mult=1.0): + + super().__init__() + self.attention_type = attention_type + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm1 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The 
norm_layer must be str or paddle.nn.layer.Layer class") + + self.attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + wd_bias=wd_bias, + lr_mult=lr_mult) + + # Temporal Attention Parameters + if self.attention_type == 'divided_space_time': + if isinstance(norm_layer, str): + self.temporal_norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.temporal_norm1 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + self.temporal_attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + wd_bias=wd_bias, + lr_mult=lr_mult) + self.temporal_fc = nn.Linear(dim, dim) + + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm2 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + wd_bias=wd_bias, + lr_mult=lr_mult) + + def forward(self, x, B, T, W): + num_spatial_tokens = (x.shape[1] - 1) // T + H = num_spatial_tokens // W + if self.attention_type in ['space_only', 'joint_space_time']: + x = paddle.add(x, self.drop_path(self.attn(self.norm1(x)))) + x = paddle.add(x, self.drop_path(self.mlp(self.norm2(x)))) + return x + elif self.attention_type == 'divided_space_time': + ########## Temporal ########## + xt = x[:, 1:, :] + _, _, _, _t, _m = B, H, W, T, xt.shape[-1] + xt = xt.reshape([-1, _t, _m]) + + res_temporal = self.drop_path( + self.temporal_attn(self.temporal_norm1(xt))) + + _, _h, _w, _t, _m = B, H, W, T, res_temporal.shape[-1] + res_temporal = res_temporal.reshape([-1, _h * _w * _t, _m]) + + res_temporal = self.temporal_fc(res_temporal) + xt = paddle.add(x[:, 1:, :], res_temporal) + + ########## Spatial ########## + init_cls_token = x[:, 0, :].unsqueeze(1) + cls_token = init_cls_token.tile((1, T, 1)) + _b, _t, _m = cls_token.shape + cls_token = cls_token.reshape([-1, _m]).unsqueeze(1) + + xs = xt + _, _h, _w, _t, _m = B, H, W, T, xs.shape[-1] + xs = xs.reshape([-1, _h, _w, _t, _m]).transpose( + (0, 3, 1, 2, 4)).reshape([-1, _h * _w, _m]) + xs = paddle.concat((cls_token, xs), axis=1) + res_spatial = self.drop_path(self.attn(self.norm1(xs))) + + # Taking care of CLS token + cls_token = res_spatial[:, 0, :] + _, _t, _m = B, T, cls_token.shape[-1] + cls_token = cls_token.reshape([-1, _t, _m]) + # averaging for every frame + cls_token = paddle.mean(cls_token, axis=1, keepdim=True) + + res_spatial = res_spatial[:, 1:, :] + _, _t, _h, _w, _m = B, T, H, W, res_spatial.shape[-1] + res_spatial = res_spatial.reshape([-1, _t, _h, _w, _m]).transpose( + (0, 2, 3, 1, 4)).reshape([-1, _h * _w * _t, _m]) + + res = res_spatial + x = xt + x = paddle.add(paddle.concat((init_cls_token, x), axis=1), + paddle.concat((cls_token, res), axis=1)) + # Mlp + x = paddle.add(x, self.drop_path(self.mlp(self.norm2(x)))) + return x + else: + raise NotImplementedError + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + def __init__(self, + img_size=224, + patch_size=16, + in_channels=3, + 
embed_dim=768, + wd_bias=True, + lr_mult=1.0): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // + patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2D(in_channels, + embed_dim, + kernel_size=patch_size, + stride=patch_size) + + def forward(self, x): + B, C, T, H, W = x.shape + + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = x.transpose((0, 2, 1, 3, 4)) # [B,T,C,H,W] + x = x.reshape([-1, C, H, W]) # [BT,C,H,W] + x = self.proj(x) # [BT,F,nH,nW] + W = x.shape[-1] + x = x.flatten(2).transpose((0, 2, 1)) # [BT,F,nHnW] + return x, T, W + + +@BACKBONES.register() +class VisionTransformer_tweaks(nn.Layer): + """ Vision Transformer with support for patch input + """ + def __init__(self, + pretrained=None, + img_size=224, + patch_size=16, + in_channels=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + num_seg=8, + attention_type='divided_space_time', + wd_bias=True, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + **args): + super().__init__() + self.pretrained = pretrained + self.num_seg = num_seg + self.attention_type = attention_type + self.lr_mult_list = lr_mult_list + self.num_features = self.embed_dim = embed_dim + + self.patch_embed = PatchEmbed(img_size=img_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim, + wd_bias=wd_bias, + lr_mult=self.lr_mult_list[0]) + num_patches = self.patch_embed.num_patches + + # Positional Embeddings + self.cls_token = self.create_parameter( + shape=(1, 1, embed_dim), + default_initializer=zeros_, + attr=ParamAttr(regularizer=L2Decay(0.0))) + self.pos_embed = self.create_parameter( + shape=(1, num_patches + 1, embed_dim), + default_initializer=zeros_, + attr=ParamAttr(regularizer=L2Decay(0.0))) + self.pos_drop = nn.Dropout(p=drop_rate) + + if self.attention_type != 'space_only': + self.time_embed = self.create_parameter( + shape=(1, num_seg, embed_dim), + default_initializer=zeros_, + attr=ParamAttr(regularizer=L2Decay(0.0))) + self.time_drop = nn.Dropout(p=drop_rate) + + self.add_parameter("pos_embed", self.pos_embed) + self.add_parameter("cls_token", self.cls_token) + + dpr = np.linspace(0, drop_path_rate, depth) + + self.blocks = nn.LayerList([ + Block(dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + epsilon=epsilon, + attention_type=self.attention_type, + wd_bias=wd_bias, + lr_mult=self.lr_mult_list[(i // 4) + 1]) for i in range(depth) + ]) + + self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon) + + def init_weights(self): + """First init model's weight""" + trunc_normal_(self.pos_embed, std=0.02) + trunc_normal_(self.cls_token, std=0.02) + self.apply(self._init_fn) + + if self.attention_type == 'divided_space_time': + i = 0 + for m in self.blocks.sublayers(include_self=True): + m_str = str(m) + if 'Block' in m_str: + if i > 0: + zeros_(m.temporal_fc.weight) + zeros_(m.temporal_fc.bias) + i += 1 + """Second, if provide pretrained ckpt, load it""" + if isinstance( + self.pretrained, str + ) and self.pretrained.strip() != "": # load pretrained 
weights + load_ckpt(self, + self.pretrained, + num_patches=self.patch_embed.num_patches, + num_seg=self.num_seg, + attention_type=self.attention_type) + elif self.pretrained is None or self.pretrained.strip() == "": + pass + else: + raise NotImplementedError + + def _init_fn(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + ones_(m.weight) + zeros_(m.bias) + + def forward_features(self, x): + # B = x.shape[0] + B = paddle.shape(x)[0] + x, T, W = self.patch_embed(x) # [BT,nH*nW,F] + cls_tokens = self.cls_token.expand((B * T, -1, -1)) # [1,1,F]->[BT,1,F] + x = paddle.concat((cls_tokens, x), axis=1) + pos_interp = (x.shape[1] != self.pos_embed.shape[1]) + if pos_interp: + pos_embed = self.pos_embed + cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1) + other_pos_embed = pos_embed[0, 1:, :].unsqueeze(0).transpose( + (0, 2, 1)) + P = int(other_pos_embed.shape[2]**0.5) + H = x.shape[1] // W + other_pos_embed = other_pos_embed.reshape([1, x.shape[2], P, P]) + new_pos_embed = F.interpolate(other_pos_embed, + size=(H, W), + mode='nearest') + new_pos_embed = new_pos_embed.flatten(2) + new_pos_embed = new_pos_embed.transpose((0, 2, 1)) + new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed), + axis=1) + x = paddle.add(x, new_pos_embed) + else: + x = paddle.add(x, self.pos_embed) + + x = self.pos_drop(x) + + # Time Embeddings + if self.attention_type != 'space_only': + cls_tokens = x[:B, 0, :].unsqueeze(1) if B > 0 else x.split( + T)[0].index_select(paddle.to_tensor([0]), axis=1) + x = x[:, 1:] + _, _n, _m = x.shape + _t = T + x = x.reshape([-1, _t, _n, _m]).transpose( + (0, 2, 1, 3)).reshape([-1, _t, _m]) + # Resizing time embeddings in case they don't match + time_interp = (T != self.time_embed.shape[1]) + if time_interp: # T' != T + time_embed = self.time_embed.transpose((0, 2, 1)).unsqueeze(0) + new_time_embed = F.interpolate(time_embed, + size=(T, x.shape[-1]), + mode='nearest').squeeze(0) + new_time_embed = new_time_embed.transpose((0, 2, 1)) + x = paddle.add(x, new_time_embed) + else: + x = paddle.add(x, self.time_embed) + + x = self.time_drop(x) + _, _t, _m = x.shape + x = x.reshape([-1, W * W * T, _m]) + x = paddle.concat((cls_tokens, x), axis=1) + + # Attention blocks + for blk in self.blocks: + x = blk(x, B, T, W) + + # Predictions for space-only baseline + if self.attention_type == 'space_only': + _, _n, _m = x.shape + _t = T + x = x.reshape([-1, _t, _n, _m]) + x = paddle.mean(x, 1) # averaging predictions for every frame + + x = self.norm(x) + return x[:, 0] # [B, embed_dim] + + def forward(self, x): + x = self.forward_features(x) + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/yowo.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/yowo.py new file mode 100644 index 0000000..5e6b88d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/yowo.py @@ -0,0 +1,150 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from ..registry import BACKBONES +from .darknet import Darknet +from .resnext101 import ResNext101 +import paddle.nn as nn +import paddle + + +class CAM_Module(nn.Layer): + def __init__(self, in_dim): + super(CAM_Module, self).__init__() + self.chanel_in = in_dim + temp = paddle.zeros([1], dtype='float32') + self.gamma = paddle.create_parameter(shape=temp.shape, dtype=str(temp.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(temp)) + self.softmax = nn.Softmax(axis=-1) + + def forward(self, x): + m_batchsize, C, height, width = x.shape + proj_query = paddle.reshape(x, [m_batchsize, C, -1]) + proj_key = paddle.transpose(paddle.reshape( + x, [m_batchsize, C, -1]), perm=[0, 2, 1]) + energy = paddle.bmm(proj_query, proj_key) + energy_new = paddle.expand_as(paddle.max( + energy, axis=-1, keepdim=True), energy) - energy + attention = self.softmax(energy_new) + proj_value = paddle.reshape(x, [m_batchsize, C, -1]) + + out = paddle.bmm(attention, proj_value) + out = out.reshape([m_batchsize, C, height, width]) + out = self.gamma * out + x + return out + + +class CFAMBlock(nn.Layer): + def __init__(self, in_channels, out_channels): + super(CFAMBlock, self).__init__() + inter_channels = 1024 + self.conv_bn_relu1 = nn.Sequential(nn.Conv2D(in_channels, inter_channels, kernel_size=1, bias_attr=False), + nn.BatchNorm2D(inter_channels), + nn.ReLU()) + self.conv_bn_relu2 = nn.Sequential(nn.Conv2D(inter_channels, inter_channels, 3, padding=1, bias_attr=False), + nn.BatchNorm2D(inter_channels), + nn.ReLU()) + + self.sc = CAM_Module(inter_channels) + + self.conv_bn_relu3 = nn.Sequential(nn.Conv2D(inter_channels, inter_channels, 3, padding=1, bias_attr=False), + nn.BatchNorm2D(inter_channels), + nn.ReLU()) + self.conv_out = nn.Sequential(nn.Dropout2D(0.1), nn.Conv2D( + inter_channels, out_channels, 1, bias_attr=True)) + + def forward(self, x): + x = self.conv_bn_relu1(x) + x = self.conv_bn_relu2(x) + x = self.sc(x) + x = self.conv_bn_relu3(x) + output = self.conv_out(x) + + return output + + +@BACKBONES.register() +class YOWO(nn.Layer): + def __init__(self, num_class, pretrained_2d=None, pretrained_3d=None): + super(YOWO, self).__init__() + + self.pretrained_2d = pretrained_2d + self.pretrained_3d = pretrained_3d + self.backbone_2d = Darknet() + self.backbone_3d = ResNext101() + self.num_ch_2d = 425 + self.num_ch_3d = 2048 + self.num_class = num_class + self.cfam = CFAMBlock(self.num_ch_2d + self.num_ch_3d, 1024) + self.conv_final = nn.Conv2D( + 1024, 5 * (self.num_class + 4 + 1), kernel_size=1, bias_attr=False) + self.seen = 0 + + def init_weights(self): + if self.pretrained_2d is not None: + self.backbone_2d = self.load_pretrain_weight( + self.backbone_2d, self.pretrained_2d) + if self.pretrained_3d is not None: + self.backbone_3d = self.load_pretrain_weight( + self.backbone_3d, self.pretrained_3d) + + def load_pretrain_weight(self, model, weights_path): + model_dict = model.state_dict() + + param_state_dict = paddle.load(weights_path) + ignore_weights = set() + + # hack: fit for faster rcnn. Pretrain weights contain prefix of 'backbone' + # while res5 module is located in bbox_head.head. Replace the prefix of + # res5 with 'bbox_head.head' to load pretrain weights correctly. 
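        # Editor's note (illustrative, not part of the patch): the rename below turns a key
        # such as 'backbone.res5.0.conv1.weight' into 'bbox_head.head.res5.0.conv1.weight'
        # so detector-style pretrain weights line up with this model's state dict; keys whose
        # rewritten name is absent from model_dict are simply left untouched.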
+ for k in list(param_state_dict.keys()): + if 'backbone.res5' in k: + new_k = k.replace('backbone', 'bbox_head.head') + if new_k in model_dict.keys(): + value = param_state_dict.pop(k) + param_state_dict[new_k] = value + + for name, weight in param_state_dict.items(): + if name in model_dict.keys(): + if list(weight.shape) != list(model_dict[name].shape): + print( + '{} not used, shape {} unmatched with {} in model.'.format( + name, weight.shape, list(model_dict[name].shape))) + ignore_weights.add(name) + else: + print('Redundant weight {} and ignore it.'.format(name)) + ignore_weights.add(name) + + for weight in ignore_weights: + param_state_dict.pop(weight, None) + + model.set_dict(param_state_dict) + print('Finish loading model weights: {}'.format(weights_path)) + return model + + def forward(self, input): + x_3d = input # Input clip + x_2d = input[:, :, -1, :, :] # Last frame of the clip that is read + + x_2d = self.backbone_2d(x_2d) + + x_3d = self.backbone_3d(x_3d) + + x_3d = paddle.squeeze(x_3d, axis=2) + + x = paddle.concat([x_3d, x_2d], axis=1) + x = self.cfam(x) + out = self.conv_final(x) + + return out diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/bbox_utils.py b/Bank_second_part/detect_process/paddlevideo/modeling/bbox_utils.py new file mode 100644 index 0000000..23b4555 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/bbox_utils.py @@ -0,0 +1,528 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
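Editor's aside: the first two helpers defined below, bbox2delta and delta2bbox, are inverse transforms. The former encodes a target box as (dx, dy, dw, dh) offsets relative to a source box; the latter decodes such offsets back into absolute (x1, y1, x2, y2) coordinates. A minimal sanity-check sketch, assuming the package root is on the import path (module path taken from this file's location, illustrative only):

    import paddle
    from paddlevideo.modeling.bbox_utils import bbox2delta, delta2bbox

    src = paddle.to_tensor([[10., 10., 50., 90.]])   # source box (x1, y1, x2, y2)
    tgt = paddle.to_tensor([[12., 8., 60., 100.]])   # target box
    weights = [1.0, 1.0, 1.0, 1.0]

    deltas = bbox2delta(src, tgt, weights)           # shape [1, 4]: (dx, dy, dw, dh)
    recovered = delta2bbox(deltas, src, weights)     # shape [1, 1, 4], equals tgt up to fp error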
+ +import math +import paddle +import paddle.nn.functional as F +import math +import numpy as np + + +def bbox2delta(src_boxes, tgt_boxes, weights): + src_w = src_boxes[:, 2] - src_boxes[:, 0] + src_h = src_boxes[:, 3] - src_boxes[:, 1] + src_ctr_x = src_boxes[:, 0] + 0.5 * src_w + src_ctr_y = src_boxes[:, 1] + 0.5 * src_h + + tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0] + tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1] + tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w + tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h + + wx, wy, ww, wh = weights + dx = wx * (tgt_ctr_x - src_ctr_x) / src_w + dy = wy * (tgt_ctr_y - src_ctr_y) / src_h + dw = ww * paddle.log(tgt_w / src_w) + dh = wh * paddle.log(tgt_h / src_h) + + deltas = paddle.stack((dx, dy, dw, dh), axis=1) + return deltas + + +def delta2bbox(deltas, boxes, weights): + clip_scale = math.log(1000.0 / 16) + + widths = boxes[:, 2] - boxes[:, 0] + heights = boxes[:, 3] - boxes[:, 1] + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + wx, wy, ww, wh = weights + dx = deltas[:, 0::4] / wx + dy = deltas[:, 1::4] / wy + dw = deltas[:, 2::4] / ww + dh = deltas[:, 3::4] / wh + # Prevent sending too large values into paddle.exp() + dw = paddle.clip(dw, max=clip_scale) + dh = paddle.clip(dh, max=clip_scale) + + pred_ctr_x = dx * widths.unsqueeze(1) + ctr_x.unsqueeze(1) + pred_ctr_y = dy * heights.unsqueeze(1) + ctr_y.unsqueeze(1) + pred_w = paddle.exp(dw) * widths.unsqueeze(1) + pred_h = paddle.exp(dh) * heights.unsqueeze(1) + + pred_boxes = [] + pred_boxes.append(pred_ctr_x - 0.5 * pred_w) + pred_boxes.append(pred_ctr_y - 0.5 * pred_h) + pred_boxes.append(pred_ctr_x + 0.5 * pred_w) + pred_boxes.append(pred_ctr_y + 0.5 * pred_h) + pred_boxes = paddle.stack(pred_boxes, axis=-1) + + return pred_boxes + + +def expand_bbox(bboxes, scale): + w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5 + h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5 + x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5 + y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5 + + w_half *= scale + h_half *= scale + + bboxes_exp = np.zeros(bboxes.shape, dtype=np.float32) + bboxes_exp[:, 0] = x_c - w_half + bboxes_exp[:, 2] = x_c + w_half + bboxes_exp[:, 1] = y_c - h_half + bboxes_exp[:, 3] = y_c + h_half + + return bboxes_exp + + +def clip_bbox(boxes, im_shape): + h, w = im_shape[0], im_shape[1] + x1 = boxes[:, 0].clip(0, w) + y1 = boxes[:, 1].clip(0, h) + x2 = boxes[:, 2].clip(0, w) + y2 = boxes[:, 3].clip(0, h) + return paddle.stack([x1, y1, x2, y2], axis=1) + + +def nonempty_bbox(boxes, min_size=0, return_mask=False): + w = boxes[:, 2] - boxes[:, 0] + h = boxes[:, 3] - boxes[:, 1] + mask = paddle.logical_and(w > min_size, w > min_size) + if return_mask: + return mask + keep = paddle.nonzero(mask).flatten() + return keep + + +def bbox_area(boxes): + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +def bbox_overlaps(boxes1, boxes2): + """ + Calculate overlaps between boxes1 and boxes2 + + Args: + boxes1 (Tensor): boxes with shape [M, 4] + boxes2 (Tensor): boxes with shape [N, 4] + + Return: + overlaps (Tensor): overlaps between boxes1 and boxes2 with shape [M, N] + """ + area1 = bbox_area(boxes1) + area2 = bbox_area(boxes2) + + xy_max = paddle.minimum( + paddle.unsqueeze(boxes1, 1)[:, :, 2:], boxes2[:, 2:]) + xy_min = paddle.maximum( + paddle.unsqueeze(boxes1, 1)[:, :, :2], boxes2[:, :2]) + width_height = xy_max - xy_min + width_height = width_height.clip(min=0) + inter = width_height.prod(axis=2) + + overlaps = paddle.where(inter > 0, inter / + (paddle.unsqueeze(area1, 1) + area2 - inter), 
+ paddle.zeros_like(inter)) + return overlaps + + +def xywh2xyxy(box): + x, y, w, h = box + x1 = x - w * 0.5 + y1 = y - h * 0.5 + x2 = x + w * 0.5 + y2 = y + h * 0.5 + return [x1, y1, x2, y2] + + +def make_grid(h, w, dtype): + yv, xv = paddle.meshgrid([paddle.arange(h), paddle.arange(w)]) + return paddle.stack((xv, yv), 2).cast(dtype=dtype) + + +def decode_yolo(box, anchor, downsample_ratio): + """decode yolo box + + Args: + box (list): [x, y, w, h], all have the shape [b, na, h, w, 1] + anchor (list): anchor with the shape [na, 2] + downsample_ratio (int): downsample ratio, default 32 + scale (float): scale, default 1. + + Return: + box (list): decoded box, [x, y, w, h], all have the shape [b, na, h, w, 1] + """ + x, y, w, h = box + na, grid_h, grid_w = x.shape[1:4] + grid = make_grid(grid_h, grid_w, x.dtype).reshape((1, 1, grid_h, grid_w, 2)) + x1 = (x + grid[:, :, :, :, 0:1]) / grid_w + y1 = (y + grid[:, :, :, :, 1:2]) / grid_h + + anchor = paddle.to_tensor(anchor) + anchor = paddle.cast(anchor, x.dtype) + anchor = anchor.reshape((1, na, 1, 1, 2)) + w1 = paddle.exp(w) * anchor[:, :, :, :, 0:1] / (downsample_ratio * grid_w) + h1 = paddle.exp(h) * anchor[:, :, :, :, 1:2] / (downsample_ratio * grid_h) + + return [x1, y1, w1, h1] + + +def iou_similarity(box1, box2, eps=1e-9): + """Calculate iou of box1 and box2 + + Args: + box1 (Tensor): box with the shape [N, M1, 4] + box2 (Tensor): box with the shape [N, M2, 4] + + Return: + iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2] + """ + box1 = box1.unsqueeze(2) # [N, M1, 4] -> [N, M1, 1, 4] + box2 = box2.unsqueeze(1) # [N, M2, 4] -> [N, 1, M2, 4] + px1y1, px2y2 = box1[:, :, :, 0:2], box1[:, :, :, 2:4] + gx1y1, gx2y2 = box2[:, :, :, 0:2], box2[:, :, :, 2:4] + x1y1 = paddle.maximum(px1y1, gx1y1) + x2y2 = paddle.minimum(px2y2, gx2y2) + overlap = (x2y2 - x1y1).clip(0).prod(-1) + area1 = (px2y2 - px1y1).clip(0).prod(-1) + area2 = (gx2y2 - gx1y1).clip(0).prod(-1) + union = area1 + area2 - overlap + eps + return overlap / union + + +def bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9): + """calculate the iou of box1 and box2 + + Args: + box1 (list): [x, y, w, h], all have the shape [b, na, h, w, 1] + box2 (list): [x, y, w, h], all have the shape [b, na, h, w, 1] + giou (bool): whether use giou or not, default False + diou (bool): whether use diou or not, default False + ciou (bool): whether use ciou or not, default False + eps (float): epsilon to avoid divide by zero + + Return: + iou (Tensor): iou of box1 and box1, with the shape [b, na, h, w, 1] + """ + px1, py1, px2, py2 = box1 + gx1, gy1, gx2, gy2 = box2 + x1 = paddle.maximum(px1, gx1) + y1 = paddle.maximum(py1, gy1) + x2 = paddle.minimum(px2, gx2) + y2 = paddle.minimum(py2, gy2) + + overlap = ((x2 - x1).clip(0)) * ((y2 - y1).clip(0)) + + area1 = (px2 - px1) * (py2 - py1) + area1 = area1.clip(0) + + area2 = (gx2 - gx1) * (gy2 - gy1) + area2 = area2.clip(0) + + union = area1 + area2 - overlap + eps + iou = overlap / union + + if giou or ciou or diou: + # convex w, h + cw = paddle.maximum(px2, gx2) - paddle.minimum(px1, gx1) + ch = paddle.maximum(py2, gy2) - paddle.minimum(py1, gy1) + if giou: + c_area = cw * ch + eps + return iou - (c_area - union) / c_area + else: + # convex diagonal squared + c2 = cw**2 + ch**2 + eps + # center distance + rho2 = ((px1 + px2 - gx1 - gx2)**2 + (py1 + py2 - gy1 - gy2)**2) / 4 + if diou: + return iou - rho2 / c2 + else: + w1, h1 = px2 - px1, py2 - py1 + eps + w2, h2 = gx2 - gx1, gy2 - gy1 + eps + delta = paddle.atan(w1 / h1) - 
paddle.atan(w2 / h2) + v = (4 / math.pi**2) * paddle.pow(delta, 2) + alpha = v / (1 + eps - iou + v) + alpha.stop_gradient = True + return iou - (rho2 / c2 + v * alpha) + else: + return iou + + +def rect2rbox(bboxes): + """ + :param bboxes: shape (n, 4) (xmin, ymin, xmax, ymax) + :return: dbboxes: shape (n, 5) (x_ctr, y_ctr, w, h, angle) + """ + bboxes = bboxes.reshape(-1, 4) + num_boxes = bboxes.shape[0] + + x_ctr = (bboxes[:, 2] + bboxes[:, 0]) / 2.0 + y_ctr = (bboxes[:, 3] + bboxes[:, 1]) / 2.0 + edges1 = np.abs(bboxes[:, 2] - bboxes[:, 0]) + edges2 = np.abs(bboxes[:, 3] - bboxes[:, 1]) + angles = np.zeros([num_boxes], dtype=bboxes.dtype) + + inds = edges1 < edges2 + + rboxes = np.stack((x_ctr, y_ctr, edges1, edges2, angles), axis=1) + rboxes[inds, 2] = edges2[inds] + rboxes[inds, 3] = edges1[inds] + rboxes[inds, 4] = np.pi / 2.0 + return rboxes + + +def delta2rbox(Rrois, + deltas, + means=[0, 0, 0, 0, 0], + stds=[1, 1, 1, 1, 1], + wh_ratio_clip=1e-6): + """ + :param Rrois: (cx, cy, w, h, theta) + :param deltas: (dx, dy, dw, dh, dtheta) + :param means: + :param stds: + :param wh_ratio_clip: + :return: + """ + means = paddle.to_tensor(means) + stds = paddle.to_tensor(stds) + deltas = paddle.reshape(deltas, [-1, deltas.shape[-1]]) + denorm_deltas = deltas * stds + means + + dx = denorm_deltas[:, 0] + dy = denorm_deltas[:, 1] + dw = denorm_deltas[:, 2] + dh = denorm_deltas[:, 3] + dangle = denorm_deltas[:, 4] + + max_ratio = np.abs(np.log(wh_ratio_clip)) + dw = paddle.clip(dw, min=-max_ratio, max=max_ratio) + dh = paddle.clip(dh, min=-max_ratio, max=max_ratio) + + Rroi_x = Rrois[:, 0] + Rroi_y = Rrois[:, 1] + Rroi_w = Rrois[:, 2] + Rroi_h = Rrois[:, 3] + Rroi_angle = Rrois[:, 4] + + gx = dx * Rroi_w * paddle.cos(Rroi_angle) - dy * Rroi_h * paddle.sin( + Rroi_angle) + Rroi_x + gy = dx * Rroi_w * paddle.sin(Rroi_angle) + dy * Rroi_h * paddle.cos( + Rroi_angle) + Rroi_y + gw = Rroi_w * dw.exp() + gh = Rroi_h * dh.exp() + ga = np.pi * dangle + Rroi_angle + ga = (ga + np.pi / 4) % np.pi - np.pi / 4 + ga = paddle.to_tensor(ga) + + gw = paddle.to_tensor(gw, dtype='float32') + gh = paddle.to_tensor(gh, dtype='float32') + bboxes = paddle.stack([gx, gy, gw, gh, ga], axis=-1) + return bboxes + + +def rbox2delta(proposals, gt, means=[0, 0, 0, 0, 0], stds=[1, 1, 1, 1, 1]): + """ + + Args: + proposals: + gt: + means: 1x5 + stds: 1x5 + + Returns: + + """ + proposals = proposals.astype(np.float64) + + PI = np.pi + + gt_widths = gt[..., 2] + gt_heights = gt[..., 3] + gt_angle = gt[..., 4] + + proposals_widths = proposals[..., 2] + proposals_heights = proposals[..., 3] + proposals_angle = proposals[..., 4] + + coord = gt[..., 0:2] - proposals[..., 0:2] + dx = (np.cos(proposals[..., 4]) * coord[..., 0] + np.sin(proposals[..., 4]) + * coord[..., 1]) / proposals_widths + dy = (-np.sin(proposals[..., 4]) * coord[..., 0] + np.cos(proposals[..., 4]) + * coord[..., 1]) / proposals_heights + dw = np.log(gt_widths / proposals_widths) + dh = np.log(gt_heights / proposals_heights) + da = (gt_angle - proposals_angle) + + da = (da + PI / 4) % PI - PI / 4 + da /= PI + + deltas = np.stack([dx, dy, dw, dh, da], axis=-1) + means = np.array(means, dtype=deltas.dtype) + stds = np.array(stds, dtype=deltas.dtype) + deltas = (deltas - means) / stds + deltas = deltas.astype(np.float32) + return deltas + + +def bbox_decode(bbox_preds, + anchors, + means=[0, 0, 0, 0, 0], + stds=[1, 1, 1, 1, 1]): + """decode bbox from deltas + Args: + bbox_preds: [N,H,W,5] + anchors: [H*W,5] + return: + bboxes: [N,H,W,5] + """ + means = 
paddle.to_tensor(means) + stds = paddle.to_tensor(stds) + num_imgs, H, W, _ = bbox_preds.shape + bboxes_list = [] + for img_id in range(num_imgs): + bbox_pred = bbox_preds[img_id] + # bbox_pred.shape=[5,H,W] + bbox_delta = bbox_pred + anchors = paddle.to_tensor(anchors) + bboxes = delta2rbox( + anchors, bbox_delta, means, stds, wh_ratio_clip=1e-6) + bboxes = paddle.reshape(bboxes, [H, W, 5]) + bboxes_list.append(bboxes) + return paddle.stack(bboxes_list, axis=0) + + +def poly_to_rbox(polys): + """ + poly:[x0,y0,x1,y1,x2,y2,x3,y3] + to + rotated_boxes:[x_ctr,y_ctr,w,h,angle] + """ + rotated_boxes = [] + for poly in polys: + poly = np.array(poly[:8], dtype=np.float32) + + pt1 = (poly[0], poly[1]) + pt2 = (poly[2], poly[3]) + pt3 = (poly[4], poly[5]) + pt4 = (poly[6], poly[7]) + + edge1 = np.sqrt((pt1[0] - pt2[0]) * (pt1[0] - pt2[0]) + (pt1[1] - pt2[ + 1]) * (pt1[1] - pt2[1])) + edge2 = np.sqrt((pt2[0] - pt3[0]) * (pt2[0] - pt3[0]) + (pt2[1] - pt3[ + 1]) * (pt2[1] - pt3[1])) + + width = max(edge1, edge2) + height = min(edge1, edge2) + + rbox_angle = 0 + if edge1 > edge2: + rbox_angle = np.arctan2( + np.float(pt2[1] - pt1[1]), np.float(pt2[0] - pt1[0])) + elif edge2 >= edge1: + rbox_angle = np.arctan2( + np.float(pt4[1] - pt1[1]), np.float(pt4[0] - pt1[0])) + + def norm_angle(angle, range=[-np.pi / 4, np.pi]): + return (angle - range[0]) % range[1] + range[0] + + rbox_angle = norm_angle(rbox_angle) + + x_ctr = np.float(pt1[0] + pt3[0]) / 2 + y_ctr = np.float(pt1[1] + pt3[1]) / 2 + rotated_box = np.array([x_ctr, y_ctr, width, height, rbox_angle]) + rotated_boxes.append(rotated_box) + ret_rotated_boxes = np.array(rotated_boxes) + assert ret_rotated_boxes.shape[1] == 5 + return ret_rotated_boxes + + +def cal_line_length(point1, point2): + import math + return math.sqrt( + math.pow(point1[0] - point2[0], 2) + math.pow(point1[1] - point2[1], 2)) + + +def get_best_begin_point_single(coordinate): + x1, y1, x2, y2, x3, y3, x4, y4 = coordinate + xmin = min(x1, x2, x3, x4) + ymin = min(y1, y2, y3, y4) + xmax = max(x1, x2, x3, x4) + ymax = max(y1, y2, y3, y4) + combinate = [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]], + [[x4, y4], [x1, y1], [x2, y2], [x3, y3]], + [[x3, y3], [x4, y4], [x1, y1], [x2, y2]], + [[x2, y2], [x3, y3], [x4, y4], [x1, y1]]] + dst_coordinate = [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]] + force = 100000000.0 + force_flag = 0 + for i in range(4): + temp_force = cal_line_length(combinate[i][0], dst_coordinate[0]) \ + + cal_line_length(combinate[i][1], dst_coordinate[1]) \ + + cal_line_length(combinate[i][2], dst_coordinate[2]) \ + + cal_line_length(combinate[i][3], dst_coordinate[3]) + if temp_force < force: + force = temp_force + force_flag = i + if force_flag != 0: + pass + return np.array(combinate[force_flag]).reshape(8) + + +def rbox2poly_single(rrect): + """ + rrect:[x_ctr,y_ctr,w,h,angle] + to + poly:[x0,y0,x1,y1,x2,y2,x3,y3] + """ + x_ctr, y_ctr, width, height, angle = rrect[:5] + tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 + # rect 2x4 + rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) + R = np.array([[np.cos(angle), -np.sin(angle)], + [np.sin(angle), np.cos(angle)]]) + # poly + poly = R.dot(rect) + x0, x1, x2, x3 = poly[0, :4] + x_ctr + y0, y1, y2, y3 = poly[1, :4] + y_ctr + poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32) + poly = get_best_begin_point_single(poly) + return poly + + +def rbox2poly(rrects): + """ + rrect:[x_ctr,y_ctr,w,h,angle] + to + poly:[x0,y0,x1,y1,x2,y2,x3,y3] + """ + polys = 
[] + for rrect in rrects: + x_ctr, y_ctr, width, height, angle = rrect[:5] + tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 + rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) + R = np.array([[np.cos(angle), -np.sin(angle)], + [np.sin(angle), np.cos(angle)]]) + poly = R.dot(rect) + x0, x1, x2, x3 = poly[0, :4] + x_ctr + y0, y1, y2, y3 = poly[1, :4] + y_ctr + poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32) + poly = get_best_begin_point_single(poly) + polys.append(poly) + polys = np.array(polys) + return polys diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/builder.py b/Bank_second_part/detect_process/paddlevideo/modeling/builder.py new file mode 100644 index 0000000..71503eb --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/builder.py @@ -0,0 +1,127 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .registry import BACKBONES, HEADS, LOSSES, RECOGNIZERS, LOCALIZERS, ROI_EXTRACTORS, DETECTORS, BBOX_ASSIGNERS, BBOX_SAMPLERS, BBOX_CODERS, PARTITIONERS, MULTIMODAL, SEGMENT, SEGMENTERS +from ..utils import build +from .registry import (BACKBONES, BBOX_ASSIGNERS, BBOX_CODERS, BBOX_SAMPLERS, + DETECTORS, ESTIMATORS, HEADS, LOCALIZERS, LOSSES, + MULTIMODAL, PARTITIONERS, RECOGNIZERS, ROI_EXTRACTORS) + + +def build_backbone(cfg): + """Build backbone.""" + return build(cfg, BACKBONES) + + +def build_roi_extractor(cfg): + """Build roi extractor.""" + return build(cfg, ROI_EXTRACTORS) + + +def build_assigner(cfg, **default_args): + """Builder of box assigner.""" + return build(cfg, BBOX_ASSIGNERS) + + +def build_sampler(cfg, **default_args): + """Builder of box sampler.""" + return build(cfg, BBOX_SAMPLERS) + + +def build_roi_extractor(cfg): + """Build roi extractor.""" + return build(cfg, ROI_EXTRACTORS) + + +def build_assigner(cfg, **default_args): + """Builder of box assigner.""" + return build(cfg, BBOX_ASSIGNERS) + + +def build_sampler(cfg, **default_args): + """Builder of box sampler.""" + return build(cfg, BBOX_SAMPLERS) + + +def build_head(cfg): + """Build head.""" + return build(cfg, HEADS) + + +def build_loss(cfg): + """Build loss.""" + return build(cfg, LOSSES) + + +def build_recognizer(cfg): + """Build recognizer.""" + return build(cfg, RECOGNIZERS, key='framework') + + +def build_segmenter(cfg): + """Build segmenter.""" + return build(cfg, SEGMENTERS, key='framework') + + +def build_localizer(cfg): + """Build localizer.""" + return build(cfg, LOCALIZERS, key='framework') + + +def build_detector(cfg, train_cfg=None, test_cfg=None): + """Build detector.""" + return build(cfg, DETECTORS, key='framework') + + +def build_partitioner(cfg): + """Build partitioner.""" + return build(cfg, PARTITIONERS, key='framework') + + +def build_estimator(cfg): + """Build estimator.""" + return build(cfg, ESTIMATORS, key='framework') + + +def build_multimodal(cfg): + """Build multimodal.""" + return build(cfg, MULTIMODAL, key='framework') + 
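Editor's aside: each builder here is a thin wrapper over the shared build utility, differing only in which registry it consults and, for framework-level objects, in keying on 'framework' rather than 'name'. build_model (defined a few lines below) then dispatches on that same key. A hedged usage sketch with a hypothetical config dict; in practice the dict comes from the parsed YAML config's MODEL section, and the exact keys depend on the model zoo entry:

    # Hypothetical config, for illustration only.
    cfg = {
        'framework': 'Recognizer2D',
        'backbone': {'name': 'ResNetTweaksTSM', 'depth': 50},
        'head': {'name': 'ppTSMHead', 'num_classes': 400, 'in_channels': 2048},
    }
    model = build_model(cfg)   # 'Recognizer2D' is registered in RECOGNIZERS,
                               # so this routes through build_recognizer(cfg)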
+ +def build_segment(cfg): + """Build segment.""" + return build(cfg, SEGMENT, key='framework') + + +def build_model(cfg): + cfg_copy = cfg.copy() + framework_type = cfg_copy.get('framework') + if framework_type in RECOGNIZERS: + return build_recognizer(cfg) + elif framework_type in LOCALIZERS: + return build_localizer(cfg) + elif framework_type in PARTITIONERS: + return build_partitioner(cfg) + elif framework_type in DETECTORS: + return build_detector(cfg) + elif framework_type in ESTIMATORS: + return build_estimator(cfg) + elif framework_type in MULTIMODAL: + return build_multimodal(cfg) + elif framework_type in SEGMENTERS: + return build_segmenter(cfg) + elif framework_type in SEGMENT: + return build_segment(cfg) + else: + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/__init__.py new file mode 100644 index 0000000..d68fe09 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/__init__.py @@ -0,0 +1,28 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .estimators import BaseEstimator, DepthEstimator +from .localizers import BaseLocalizer, BMNLocalizer +from .partitioners import BasePartitioner, TransNetV2Partitioner +from .recognizers import BaseRecognizer, Recognizer2D +from .multimodal import ActBert, BaseMultimodal +from .segment import BaseSegment, CFBI +from .segmenters import MSTCN + +__all__ = [ + 'BaseRecognizer', 'Recognizer2D', 'BaseLocalizer', 'BMNLocalizer', + 'BasePartitioner', 'TransNetV2Partitioner', 'BaseEstimator', + 'DepthEstimator', 'BaseMultimodal', 'ActBert', 'BaseSegment', 'CFBI', + 'MSTCN' +] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..4d7bf2b Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__init__.py new file mode 100644 index 0000000..74dcac0 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
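Editor's aside: the detector classes imported below follow the same calling convention as the rest of the framework: a single forward(data_batch, mode=...) that routes to train_step, val_step, test_step, or infer_step (see BaseDetector.forward further down). A hedged usage sketch, assuming a config whose 'framework' is 'FastRCNN' and a data_batch prepared by the matching dataset pipeline:

    detector = build_detector(cfg)                  # cfg['framework'] == 'FastRCNN'
    losses = detector(data_batch, mode='train')     # dict of losses from the RoI head
    preds = detector(data_batch, mode='infer')      # detections via roi_head.simple_test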
+ +from .base import BaseDetector +from .fast_rcnn import FastRCNN +from .two_stage import TwoStageDetector + +__all__ = ['BaseDetector', 'TwoStageDetector', 'FastRCNN'] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..bdf6421 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..b61c7f1 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/fast_rcnn.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/fast_rcnn.cpython-310.pyc new file mode 100644 index 0000000..e7704f4 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/fast_rcnn.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/two_stage.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/two_stage.cpython-310.pyc new file mode 100644 index 0000000..176db47 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/two_stage.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/base.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/base.py new file mode 100644 index 0000000..4d5ccb8 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/base.py @@ -0,0 +1,51 @@ +from abc import abstractmethod +from ... import builder +import paddle.nn as nn +from ...registry import DETECTORS + +@DETECTORS.register() +class BaseDetector(nn.Layer): + """Base class for detectors. """ + def __init__(self, backbone=None, head=None): + + super().__init__() + + def init_weights(self): + """Initialize the model network weights. """ + self.backbone.init_weights() + self.head.init_weights() + + def extract_feature(self, imgs, iter_num): + """Extract features through a backbone. """ + feature = self.backbone(imgs) + return feature + + def forward(self, data_batch, mode='infer'): + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating step. + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Test step. 
+ """ + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/fast_rcnn.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/fast_rcnn.py new file mode 100644 index 0000000..e8f912d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/fast_rcnn.py @@ -0,0 +1,34 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .two_stage import TwoStageDetector +from ...registry import DETECTORS + +@DETECTORS.register() +class FastRCNN(TwoStageDetector): + + def __init__(self, + backbone, + head=None, + train_cfg=None, + test_cfg=None, + neck=None, + pretrained=None): + super(FastRCNN, self).__init__( + backbone=backbone, + neck=neck, + roi_head=head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained) diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/two_stage.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/two_stage.py new file mode 100644 index 0000000..f9deb1d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/two_stage.py @@ -0,0 +1,186 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +from ... import builder +import paddle.distributed as dist +from ...registry import DETECTORS +from .base import BaseDetector + + +@DETECTORS.register() +class TwoStageDetector(BaseDetector): + """Base class for two-stage detectors. 
""" + + def __init__(self, + backbone, + neck=None, + rpn_head=None, + roi_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(TwoStageDetector, self).__init__() + self.backbone = builder.build_backbone(backbone) + + if neck is not None: + self.neck = neck # useless + + if rpn_head is not None: + rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None + rpn_head_ = rpn_head.copy() + rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn) + self.rpn_head = builder.build_head(rpn_head_) + + if roi_head is not None: + self.roi_head = builder.build_head(roi_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if pretrained is not None: + self.init_weights(pretrained=pretrained) + + @property + def with_rpn(self): + """whether the detector has RPN""" + return hasattr(self, 'rpn_head') and self.rpn_head is not None + + @property + def with_roi_head(self): + """whether the detector has a RoI head""" + return hasattr(self, 'roi_head') and self.roi_head is not None + + def init_weights(self, pretrained=None): + """Initialize the weights in detector. """ + super(TwoStageDetector, self).init_weights(pretrained) + self.backbone.init_weights(pretrained=pretrained) + if self.with_rpn: + self.rpn_head.init_weights() + if self.with_roi_head: + self.roi_head.init_weights(pretrained) + + def extract_feat(self, img): + """Directly extract features from the backbone.""" + x = self.backbone(img) + return x + + def train_step(self, data, **kwargs): + img_slow = data[0] + img_fast = data[1] + proposals, gt_bboxes, gt_labels, scores, entity_ids = self.get_unpad_datas( + data) + img_shape = data[7] + img_idx = data[8] + img_metas = scores, entity_ids + x = self.extract_feat(img=[img_slow, img_fast]) + roi_losses = self.roi_head.train_step(x, img_metas, proposals, + gt_bboxes, gt_labels, **kwargs) + losses = dict() + losses.update(roi_losses) + + return losses + + def val_step(self, data, rescale=False): + img_slow = data[0] + img_fast = data[1] + proposals, gt_bboxes, gt_labels, scores, entity_ids = self.get_unpad_datas( + data) + img_shape = data[7] + img_metas = scores, entity_ids + x = self.extract_feat(img=[img_slow, img_fast]) + + return self.roi_head.simple_test(x, + proposals[0], + img_shape, + rescale=rescale) + + def test_step(self, data, rescale=False): + return self.val_step(data, rescale) + + def infer_step(self, data, rescale=False): + ''' model inference''' + + img_slow = data[0] + img_fast = data[1] + proposals = data[2] + img_shape = data[3] + + # using slowfast model to extract spatio-temporal features + x = self.extract_feat(img=[img_slow, img_fast]) + + ret = self.roi_head.simple_test(x, + proposals[0], + img_shape, + rescale=rescale) + return ret + + def get_unpad_datas(self, data): + ''' get original datas padded in dataset ''' + pad_proposals = data[2] + pad_gt_bboxes = data[3] + pad_gt_labels = data[4] + pad_scores, pad_entity_ids = data[5], data[6] + len_proposals = data[9] + len_gt_bboxes = data[10] + len_gt_labels = data[11] + len_scores = data[12] + len_entity_ids = data[13] + N = pad_proposals.shape[0] + proposals = [] + gt_bboxes = [] + gt_labels = [] + scores = [] + entity_ids = [] + for bi in range(N): + pad_proposal = pad_proposals[bi] + len_proposal = len_proposals[bi] + index_proposal = paddle.arange(len_proposal) + proposal = paddle.index_select(x=pad_proposal, + index=index_proposal, + axis=0) + proposals.append(proposal) + + pad_gt_bbox = pad_gt_bboxes[bi] + len_gt_bbox = len_gt_bboxes[bi] + index_gt_bbox = 
paddle.arange(len_gt_bbox) + gt_bbox = paddle.index_select(x=pad_gt_bbox, + index=index_gt_bbox, + axis=0) + gt_bboxes.append(gt_bbox) + + pad_gt_label = pad_gt_labels[bi] + len_gt_label = len_gt_labels[bi] + index_gt_label = paddle.arange(len_gt_label) + gt_label = paddle.index_select(x=pad_gt_label, + index=index_gt_label, + axis=0) + gt_labels.append(gt_label) + + pad_score = pad_scores[bi] + len_score = len_scores[bi] + index_score = paddle.arange(len_score) + score = paddle.index_select(x=pad_score, index=index_score, axis=0) + scores.append(score) + + pad_entity_id = pad_entity_ids[bi] + len_entity_id = len_entity_ids[bi] + index_entity_id = paddle.arange(len_entity_id) + entity_id = paddle.index_select(x=pad_entity_id, + index=index_entity_id, + axis=0) + entity_ids.append(entity_id) + + return proposals, gt_bboxes, gt_labels, scores, entity_ids diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__init__.py new file mode 100644 index 0000000..e2bda93 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__init__.py @@ -0,0 +1,4 @@ +from .base import BaseEstimator +from .depth_estimator import DepthEstimator + +__all__ = ['DepthEstimator', 'BaseEstimator'] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..72d1ff9 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..8bee686 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__pycache__/depth_estimator.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__pycache__/depth_estimator.cpython-310.pyc new file mode 100644 index 0000000..7ad977b Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__pycache__/depth_estimator.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/base.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/base.py new file mode 100644 index 0000000..cdddd67 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/base.py @@ -0,0 +1,82 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
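# --- Illustrative sketch (not part of this diff) ---
# get_unpad_datas above strips the per-sample padding added by the dataset:
# for each sample it builds an index [0, len) with paddle.arange and gathers
# only the valid rows with paddle.index_select. A hypothetical, self-contained
# version of that trimming step (unpad_batch is an assumed helper name):

import paddle

def unpad_batch(padded, lengths):
    """padded: [N, P, D] tensor, lengths: [N] true row counts -> list of [len_i, D]."""
    out = []
    for i in range(padded.shape[0]):
        idx = paddle.arange(int(lengths[i]))          # indices of the valid rows
        out.append(paddle.index_select(padded[i], index=idx, axis=0))
    return out

# usage: two samples padded to 4 proposals, with 2 and 3 real proposals respectively
padded = paddle.rand([2, 4, 5])
proposals = unpad_batch(padded, paddle.to_tensor([2, 3]))
assert [p.shape[0] for p in proposals] == [2, 3]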
+ +from abc import abstractmethod + +import paddle +import paddle.nn as nn +from paddlevideo.modeling.registry import ESTIMATORS +from paddlevideo.utils import get_logger + +from ... import builder + +logger = get_logger("paddlevideo") + + +@ESTIMATORS.register() +class BaseEstimator(nn.Layer): + """BaseEstimator + + """ + def __init__(self, backbone=None, head=None): + super().__init__() + if backbone is not None: + self.backbone = builder.build_backbone(backbone) + if hasattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + self.backbone = None + + if head is not None: + self.head_name = head.name + self.head = builder.build_head(head) + if hasattr(self.head, 'init_weights'): + self.head.init_weights() + else: + self.head = None + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch): + """Define how the model is going to valid, from input to output.""" + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + raise NotImplementedError + + @abstractmethod + def infer_step(self, data_batch): + """Define how the model is going to infer, from input to output.""" + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/depth_estimator.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/depth_estimator.py new file mode 100644 index 0000000..13ee877 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/depth_estimator.py @@ -0,0 +1,59 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import paddle +from paddlevideo.modeling.framework.estimators.base import BaseEstimator +from paddlevideo.modeling.registry import ESTIMATORS +from paddlevideo.utils import get_logger + +from ... import builder + +logger = get_logger("paddlevideo") + + +@ESTIMATORS.register() +class DepthEstimator(BaseEstimator): + """DepthEstimator + """ + def forward_net(self, inputs, day_or_night='day_and_night'): + if self.backbone is not None: + outputs = self.backbone(inputs, day_or_night) + else: + outputs = inputs + return outputs + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. 
+ """ + inputs, _ = data_batch + outputs = self.forward_net(inputs, day_or_night='day_and_night') + loss_metrics = self.head.loss(inputs, outputs) + return loss_metrics + + def val_step(self, data_batch): + inputs, day_or_night = data_batch + outputs = self.forward_net(inputs, day_or_night=day_or_night) + loss_metrics = self.head.loss(inputs, outputs) + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + inputs, day_or_night = data_batch + outputs = self.forward_net(inputs, day_or_night=day_or_night) + loss_metrics = self.head.loss(inputs, outputs) + return loss_metrics + + def infer_step(self, data_batch): + """Define how the model is going to infer, from input to output.""" + inputs = data_batch[0] + outputs = self.forward_net(inputs, day_or_night='day') + return outputs diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__init__.py new file mode 100644 index 0000000..323a72c --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__init__.py @@ -0,0 +1,19 @@ +# copyright (c) 2020 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license" +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. 
+ +from .base import BaseLocalizer +from .bmn_localizer import BMNLocalizer +from .yowo_localizer import YOWOLocalizer + +__all__ = ['BaseLocalizer', 'BMNLocalizer', 'YOWOLocalizer'] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..4e79cd8 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..f7e5cef Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/bmn_localizer.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/bmn_localizer.cpython-310.pyc new file mode 100644 index 0000000..3fdc87d Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/bmn_localizer.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/yowo_localizer.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/yowo_localizer.cpython-310.pyc new file mode 100644 index 0000000..f3c15db Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/yowo_localizer.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/yowo_utils.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/yowo_utils.cpython-310.pyc new file mode 100644 index 0000000..15bd20f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/yowo_utils.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/base.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/base.py new file mode 100644 index 0000000..cfd2869 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/base.py @@ -0,0 +1,74 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import abstractmethod +import paddle.nn as nn +from ... import builder + + +class BaseLocalizer(nn.Layer): + """Base class for Localization. + All localizer should subclass it. 
+ All subclass should overwrite: + - Methods:``train_step``, define your train step. + - Methods:``valid_step``, define your valid step, always the same as train_step. + - Methods:``test_step``, define your test step. + """ + def __init__(self, backbone, loss): + super().__init__() + self.backbone = builder.build_backbone(backbone) + self.loss = builder.build_loss(loss) + self.init_weights() + + def init_weights(self): + """Initialize the model network weights. """ + if getattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + pass + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. input_data_batch -> loss_metric + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating setp. input_data_batch -> loss_metric + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Tets setp. to get acc in test data. input_data_batch -> output + """ + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/bmn_localizer.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/bmn_localizer.py new file mode 100644 index 0000000..5afbd3a --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/bmn_localizer.py @@ -0,0 +1,69 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import LOCALIZERS +from .base import BaseLocalizer + +import paddle + + +@LOCALIZERS.register() +class BMNLocalizer(BaseLocalizer): + """BMN Localization framework + """ + def forward_net(self, imgs): + """Call backbone forward. + """ + preds = self.backbone(imgs) + return preds + + def train_step(self, data_batch): + """Training step. + """ + x_data = data_batch[0] + gt_iou_map = data_batch[1] + gt_start = data_batch[2] + gt_end = data_batch[3] + gt_iou_map.stop_gradient = True + gt_start.stop_gradient = True + gt_end.stop_gradient = True + + # call Model forward + pred_bm, pred_start, pred_end = self.forward_net(x_data) + # call Loss forward + loss = self.loss(pred_bm, pred_start, pred_end, gt_iou_map, gt_start, + gt_end) + avg_loss = paddle.mean(loss) + loss_metrics = dict() + loss_metrics['loss'] = avg_loss + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + return self.train_step(data_batch) + + def test_step(self, data_batch): + """Test step. 
+ """ + x_data = data_batch[0] + pred_bm, pred_start, pred_end = self.forward_net(x_data) + return pred_bm, pred_start, pred_end + + def infer_step(self, data_batch): + """Infer step + """ + x_data = data_batch[0] + + # call Model forward + pred_bm, pred_start, pred_end = self.forward_net(x_data) + return pred_bm, pred_start, pred_end diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/yowo_localizer.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/yowo_localizer.py new file mode 100644 index 0000000..c3613c6 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/yowo_localizer.py @@ -0,0 +1,161 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import LOCALIZERS +from .base import BaseLocalizer +from .yowo_utils import truths_length, nms, get_region_boxes, bbox_iou + + +@LOCALIZERS.register() +class YOWOLocalizer(BaseLocalizer): + """YOWO Localization framework + """ + + def forward_net(self, imgs): + """Call backbone forward. + """ + # imgs.shape=[N,C,T,H,W], for YOWO + preds = self.backbone(imgs) + return preds + + def train_step(self, data_batch): + """Training step. + """ + x_data = data_batch[0] + target = data_batch[1].squeeze(1) # indeed do squeeze to adapt to paddle tensor + target.stop_gradient = True + + # call Model forward + out = self.forward_net(x_data) + # call Loss forward + loss, nCorrect = self.loss(out, target) + loss_metrics = dict() + loss_metrics['loss'] = loss + loss_metrics['nCorrect'] = nCorrect + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. 
+ """ + total = 0.0 + proposals = 0.0 + correct = 0.0 + fscore = 0.0 + eps = 1e-5 + nms_thresh = 0.4 + iou_thresh = 0.5 + + x_data = data_batch[0] + target = data_batch[1].squeeze(1) # indeed do squeeze to adapt to paddle tensor + frame_idx = data_batch[2] + target.stop_gradient = True + # call Model forward + out = self.forward_net(x_data) + all_boxes = get_region_boxes(out) + out_boxes = [] + + for i in range(out.shape[0]): + boxes = all_boxes[i] + boxes = nms(boxes, nms_thresh) + out_boxes.append(boxes) + truths = target[i].reshape([-1, 5]) + num_gts = truths_length(truths) + total = total + num_gts + pred_list = [] + for i in range(len(boxes)): + if boxes[i][4] > 0.25: + proposals = proposals + 1 + pred_list.append(i) + for i in range(num_gts): + box_gt = [truths[i][1], truths[i][2], truths[i][3], truths[i][4], 1.0, 1.0, truths[i][0]] + best_iou = 0 + best_j = -1 + for j in pred_list: # ITERATE THROUGH ONLY CONFIDENT BOXES + iou = bbox_iou(box_gt, boxes[j], x1y1x2y2=False) + if iou > best_iou: + best_j = j + best_iou = iou + if best_iou > iou_thresh and int(boxes[best_j][6]) == box_gt[6]: + correct = correct + 1 + + precision = 1.0 * correct / (proposals + eps) + recall = 1.0 * correct / (total + eps) + fscore = 2.0 * precision * recall / (precision + recall + eps) + + outs = dict() + outs['precision'] = precision + outs['recall'] = recall + outs['fscore'] = fscore + outs['frame_idx'] = frame_idx + return outs + + def test_step(self, data_batch): + """Test step. + """ + total = 0.0 + proposals = 0.0 + correct = 0.0 + fscore = 0.0 + eps = 1e-5 + nms_thresh = 0.4 + iou_thresh = 0.5 + + x_data = data_batch[0] + target = data_batch[1].squeeze(1) # indeed do squeeze to adapt to paddle tensor + frame_idx = data_batch[2] + target.stop_gradient = True + # call Model forward + out = self.forward_net(x_data) + all_boxes = get_region_boxes(out) + out_boxes = [] + + for i in range(out.shape[0]): + boxes = all_boxes[i] + boxes = nms(boxes, nms_thresh) + out_boxes.append(boxes) + truths = target[i].reshape([-1, 5]) + num_gts = truths_length(truths) + total = total + num_gts + pred_list = [] + for i in range(len(boxes)): + if boxes[i][4] > 0.25: + proposals = proposals + 1 + pred_list.append(i) + for i in range(num_gts): + box_gt = [truths[i][1], truths[i][2], truths[i][3], truths[i][4], 1.0, 1.0, truths[i][0]] + best_iou = 0 + best_j = -1 + for j in pred_list: # ITERATE THROUGH ONLY CONFIDENT BOXES + iou = bbox_iou(box_gt, boxes[j], x1y1x2y2=False) + if iou > best_iou: + best_j = j + best_iou = iou + if best_iou > iou_thresh and int(boxes[best_j][6]) == box_gt[6]: + correct = correct + 1 + + precision = 1.0 * correct / (proposals + eps) + recall = 1.0 * correct / (total + eps) + fscore = 2.0 * precision * recall / (precision + recall + eps) + + outs = dict() + outs['boxes'] = out_boxes + outs['precision'] = precision + outs['recall'] = recall + outs['fscore'] = fscore + outs['frame_idx'] = frame_idx + return outs + + def infer_step(self, data_batch): + """Infer step. + """ + out = self.forward_net(data_batch[0]) + return out \ No newline at end of file diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/yowo_utils.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/yowo_utils.py new file mode 100644 index 0000000..9f0e016 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/yowo_utils.py @@ -0,0 +1,359 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import paddle.nn as nn +import numpy as np +from builtins import range as xrange + + +def truths_length(truths): + for i in range(50): + if truths[i][1] == 0: + return i + + +def nms(boxes, nms_thresh): + if len(boxes) == 0: + return boxes + + det_confs = paddle.zeros([len(boxes)]) + for i in range(len(boxes)): + det_confs[i] = 1 - boxes[i][4] + + sortIds = paddle.argsort(det_confs) + out_boxes = [] + for i in range(len(boxes)): + box_i = boxes[sortIds[i]] + if box_i[4] > 0: + out_boxes.append(box_i) + for j in range(i + 1, len(boxes)): + box_j = boxes[sortIds[j]] + if bbox_iou(box_i, box_j, x1y1x2y2=False) > nms_thresh: + box_j[4] = 0 + return out_boxes + + +def convert2cpu(gpu_matrix): + float_32_g = gpu_matrix.astype('float32') + return float_32_g.cpu() + + +def convert2cpu_long(gpu_matrix): + int_64_g = gpu_matrix.astype('int64') + return int_64_g.cpu() + + +def get_region_boxes(output, conf_thresh=0.005, num_classes=24, + anchors=[0.70458, 1.18803, 1.26654, 2.55121, 1.59382, + 4.08321, 2.30548, 4.94180, 3.52332, 5.91979], + num_anchors=5, only_objectness=1, validation=False): + anchor_step = len(anchors) // num_anchors + if output.dim() == 3: + output = output.unsqueeze(0) + batch = output.shape[0] + assert (output.shape[1] == (5 + num_classes) * num_anchors) + h = output.shape[2] + w = output.shape[3] + all_boxes = [] + output = paddle.reshape( + output, [batch * num_anchors, 5 + num_classes, h * w]) + output = paddle.transpose(output, (1, 0, 2)) + output = paddle.reshape( + output, [5 + num_classes, batch * num_anchors * h * w]) + + grid_x = paddle.linspace(0, w - 1, w) + grid_x = paddle.tile(grid_x, [h, 1]) + grid_x = paddle.tile(grid_x, [batch * num_anchors, 1, 1]) + grid_x = paddle.reshape(grid_x, [batch * num_anchors * h * w]).cuda() + + grid_y = paddle.linspace(0, h - 1, h) + grid_y = paddle.tile(grid_y, [w, 1]).t() + grid_y = paddle.tile(grid_y, [batch * num_anchors, 1, 1]) + grid_y = paddle.reshape(grid_y, [batch * num_anchors * h * w]).cuda() + + sigmoid = nn.Sigmoid() + xs = sigmoid(output[0]) + grid_x + ys = sigmoid(output[1]) + grid_y + + anchor_w = paddle.to_tensor(anchors) + anchor_w = paddle.reshape(anchor_w, [num_anchors, anchor_step]) + anchor_w = paddle.index_select(anchor_w, index=paddle.to_tensor( + np.array([0]).astype('int32')), axis=1) + + anchor_h = paddle.to_tensor(anchors) + anchor_h = paddle.reshape(anchor_h, [num_anchors, anchor_step]) + anchor_h = paddle.index_select(anchor_h, index=paddle.to_tensor( + np.array([1]).astype('int32')), axis=1) + + anchor_w = paddle.tile(anchor_w, [batch, 1]) + anchor_w = paddle.tile(anchor_w, [1, 1, h * w]) + anchor_w = paddle.reshape(anchor_w, [batch * num_anchors * h * w]).cuda() + + anchor_h = paddle.tile(anchor_h, [batch, 1]) + anchor_h = paddle.tile(anchor_h, [1, 1, h * w]) + anchor_h = paddle.reshape(anchor_h, [batch * num_anchors * h * w]).cuda() + + ws = paddle.exp(output[2]) * anchor_w + hs = paddle.exp(output[3]) * anchor_h + + det_confs = 
sigmoid(output[4]) + + cls_confs = paddle.to_tensor(output[5:5 + num_classes], stop_gradient=True) + cls_confs = paddle.transpose(cls_confs, [1, 0]) + s = nn.Softmax() + cls_confs = paddle.to_tensor(s(cls_confs)) + + cls_max_confs = paddle.max(cls_confs, axis=1) + cls_max_ids = paddle.argmax(cls_confs, axis=1) + + cls_max_confs = paddle.reshape(cls_max_confs, [-1]) + cls_max_ids = paddle.reshape(cls_max_ids, [-1]) + + sz_hw = h * w + sz_hwa = sz_hw * num_anchors + + det_confs = convert2cpu(det_confs) + cls_max_confs = convert2cpu(cls_max_confs) + cls_max_ids = convert2cpu_long(cls_max_ids) + xs = convert2cpu(xs) + ys = convert2cpu(ys) + ws = convert2cpu(ws) + hs = convert2cpu(hs) + if validation: + cls_confs = convert2cpu(cls_confs.reshape([-1, num_classes])) + for b in range(batch): + boxes = [] + for cy in range(h): + for cx in range(w): + for i in range(num_anchors): + ind = b * sz_hwa + i * sz_hw + cy * w + cx + det_conf = det_confs[ind] + if only_objectness: + conf = det_confs[ind] + else: + conf = det_confs[ind] * cls_max_confs[ind] + + if conf > conf_thresh: + bcx = xs[ind] + bcy = ys[ind] + bw = ws[ind] + bh = hs[ind] + cls_max_conf = cls_max_confs[ind] + cls_max_id = cls_max_ids[ind] + box = [bcx / w, bcy / h, bw / w, bh / h, + det_conf, cls_max_conf, cls_max_id] + if (not only_objectness) and validation: + for c in range(num_classes): + tmp_conf = cls_confs[ind][c] + if c != cls_max_id and det_confs[ind] * tmp_conf > conf_thresh: + box.append(tmp_conf) + box.append(c) + boxes.append(box) + all_boxes.append(boxes) + return all_boxes + + +def bbox_iou(box1, box2, x1y1x2y2=True): + if x1y1x2y2: + mx = min(box1[0], box2[0]) + Mx = max(box1[2], box2[2]) + my = min(box1[1], box2[1]) + My = max(box1[3], box2[3]) + w1 = box1[2] - box1[0] + h1 = box1[3] - box1[1] + w2 = box2[2] - box2[0] + h2 = box2[3] - box2[1] + else: + mx = min(float(box1[0] - box1[2] / 2.0), + float(box2[0] - box2[2] / 2.0)) + Mx = max(float(box1[0] + box1[2] / 2.0), + float(box2[0] + box2[2] / 2.0)) + my = min(float(box1[1] - box1[3] / 2.0), + float(box2[1] - box2[3] / 2.0)) + My = max(float(box1[1] + box1[3] / 2.0), + float(box2[1] + box2[3] / 2.0)) + w1 = box1[2] + h1 = box1[3] + w2 = box2[2] + h2 = box2[3] + uw = Mx - mx + uh = My - my + cw = w1 + w2 - uw + ch = h1 + h2 - uh + carea = 0 + if cw <= 0 or ch <= 0: + return paddle.to_tensor(0.0) + + area1 = w1 * h1 + area2 = w2 * h2 + carea = cw * ch + uarea = area1 + area2 - carea + return carea / uarea + + +def bbox_ious(boxes1, boxes2, x1y1x2y2=True): + if x1y1x2y2: + mx = paddle.min(boxes1[0], boxes2[0]) + Mx = paddle.max(boxes1[2], boxes2[2]) + my = paddle.min(boxes1[1], boxes2[1]) + My = paddle.max(boxes1[3], boxes2[3]) + w1 = boxes1[2] - boxes1[0] + h1 = boxes1[3] - boxes1[1] + w2 = boxes2[2] - boxes2[0] + h2 = boxes2[3] - boxes2[1] + else: + mx = paddle.min(paddle.stack( + [boxes1[0] - boxes1[2] / 2.0, boxes2[0] - boxes2[2] / 2.0], axis=0), axis=0) + Mx = paddle.max(paddle.stack( + [boxes1[0] + boxes1[2] / 2.0, boxes2[0] + boxes2[2] / 2.0], axis=0), axis=0) + my = paddle.min(paddle.stack( + [boxes1[1] - boxes1[3] / 2.0, boxes2[1] - boxes2[3] / 2.0], axis=0), axis=0) + My = paddle.max(paddle.stack( + [boxes1[1] + boxes1[3] / 2.0, boxes2[1] + boxes2[3] / 2.0], axis=0), axis=0) + w1 = boxes1[2] + h1 = boxes1[3] + w2 = boxes2[2] + h2 = boxes2[3] + uw = Mx - mx + uh = My - my + cw = w1 + w2 - uw + ch = h1 + h2 - uh + mask = paddle.cast(cw <= 0, dtype="int32") + \ + paddle.cast(ch <= 0, dtype="int32") > 0 + area1 = w1 * h1 + area2 = w2 * h2 + carea = cw * ch + 
carea[mask] = 0 + uarea = area1 + area2 - carea + return carea / uarea + + +# this function works for building the groud truth +def build_targets(pred_boxes, target, anchors, num_anchors, num_classes, nH, nW, noobject_scale, object_scale, + sil_thresh): + # nH, nW here are number of grids in y and x directions (7, 7 here) + nB = target.shape[0] # batch size + nA = num_anchors # 5 for our case + nC = num_classes + anchor_step = len(anchors) // num_anchors + conf_mask = paddle.ones([nB, nA, nH, nW]) * noobject_scale + coord_mask = paddle.zeros([nB, nA, nH, nW]) + cls_mask = paddle.zeros([nB, nA, nH, nW]) + tx = paddle.zeros([nB, nA, nH, nW]) + ty = paddle.zeros([nB, nA, nH, nW]) + tw = paddle.zeros([nB, nA, nH, nW]) + th = paddle.zeros([nB, nA, nH, nW]) + tconf = paddle.zeros([nB, nA, nH, nW]) + tcls = paddle.zeros([nB, nA, nH, nW]) + + # for each grid there are nA anchors + # nAnchors is the number of anchor for one image + nAnchors = nA * nH * nW + nPixels = nH * nW + # for each image + for b in xrange(nB): + # get all anchor boxes in one image + # (4 * nAnchors) + cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t() + # initialize iou score for each anchor + cur_ious = paddle.zeros([nAnchors]) + for t in xrange(50): + # for each anchor 4 coordinate parameters, already in the coordinate system for the whole image + # this loop is for anchors in each image + # for each anchor 5 parameters are available (class, x, y, w, h) + if target[b][t * 5 + 1] == 0: + break + gx = target[b][t * 5 + 1] * nW + gy = target[b][t * 5 + 2] * nH + gw = target[b][t * 5 + 3] * nW + gh = target[b][t * 5 + 4] * nH + # groud truth boxes + cur_gt_boxes = paddle.tile(paddle.to_tensor( + [gx, gy, gw, gh], dtype='float32').t(), [nAnchors, 1]).t() + # bbox_ious is the iou value between orediction and groud truth + cur_ious = paddle.max( + paddle.stack([cur_ious, bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False)], axis=0), axis=0) + # if iou > a given threshold, it is seen as it includes an object + # conf_mask[b][cur_ious>sil_thresh] = 0 + conf_mask_t = paddle.reshape(conf_mask, [nB, -1]) + conf_mask_t[b, cur_ious > sil_thresh] = 0 + conf_mask_tt = paddle.reshape(conf_mask_t[b], [nA, nH, nW]) + conf_mask[b] = conf_mask_tt + + # number of ground truth + nGT = 0 + nCorrect = 0 + for b in xrange(nB): + # anchors for one batch (at least batch size, and for some specific classes, there might exist more than one anchor) + for t in xrange(50): + if target[b][t * 5 + 1] == 0: + break + nGT = nGT + 1 + best_iou = 0.0 + best_n = -1 + min_dist = 10000 + # the values saved in target is ratios + # times by the width and height of the output feature maps nW and nH + gx = target[b][t * 5 + 1] * nW + gy = target[b][t * 5 + 2] * nH + gi = int(gx) + gj = int(gy) + + gw = target[b][t * 5 + 3] * nW + gh = target[b][t * 5 + 4] * nH + gt_box = [0, 0, gw, gh] + for n in xrange(nA): + # get anchor parameters (2 values) + aw = anchors[anchor_step * n] + ah = anchors[anchor_step * n + 1] + anchor_box = [0, 0, aw, ah] + # only consider the size (width and height) of the anchor box + iou = bbox_iou(anchor_box, gt_box, x1y1x2y2=False) + # get the best anchor form with the highest iou + if iou > best_iou: + best_iou = iou + best_n = n + + # then we determine the parameters for an anchor (4 values together) + gt_box = [gx, gy, gw, gh] + # find corresponding prediction box + pred_box = pred_boxes[b * nAnchors + + best_n * nPixels + gj * nW + gi] + + # only consider the best anchor box, for each image + coord_mask[b, best_n, gj, gi] = 
1 + cls_mask[b, best_n, gj, gi] = 1 + + # in this cell of the output feature map, there exists an object + conf_mask[b, best_n, gj, gi] = object_scale + tx[b, best_n, gj, gi] = paddle.cast( + target[b][t * 5 + 1] * nW - gi, dtype='float32') + ty[b, best_n, gj, gi] = paddle.cast( + target[b][t * 5 + 2] * nH - gj, dtype='float32') + tw[b, best_n, gj, gi] = math.log( + gw / anchors[anchor_step * best_n]) + th[b, best_n, gj, gi] = math.log( + gh / anchors[anchor_step * best_n + 1]) + iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) # best_iou + # confidence equals to iou of the corresponding anchor + tconf[b, best_n, gj, gi] = paddle.cast(iou, dtype='float32') + tcls[b, best_n, gj, gi] = paddle.cast( + target[b][t * 5], dtype='float32') + # if ious larger than 0.5, we justify it as a correct prediction + if iou > 0.5: + nCorrect = nCorrect + 1 + # true values are returned + return nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__init__.py new file mode 100644 index 0000000..e1efec3 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
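# --- Illustrative sketch (not part of this diff) ---
# bbox_iou above (with x1y1x2y2=False) treats boxes as (cx, cy, w, h): it
# derives the union extent from the centres and sizes, infers the overlap
# width as w1 + w2 - union_width (likewise for height), and returns
# intersection / union. A plain-Python restatement of that computation,
# assuming the same centre-format boxes:

def iou_cxcywh(box1, box2):
    cx1, cy1, w1, h1 = box1
    cx2, cy2, w2, h2 = box2
    # width/height of the smallest box enclosing both
    uw = max(cx1 + w1 / 2, cx2 + w2 / 2) - min(cx1 - w1 / 2, cx2 - w2 / 2)
    uh = max(cy1 + h1 / 2, cy2 + h2 / 2) - min(cy1 - h1 / 2, cy2 - h2 / 2)
    cw, ch = w1 + w2 - uw, h1 + h2 - uh      # overlap width/height
    if cw <= 0 or ch <= 0:
        return 0.0
    inter = cw * ch
    union = w1 * h1 + w2 * h2 - inter
    return inter / union

# usage: two unit boxes whose centres are half a unit apart overlap with IoU = 1/3
print(iou_cxcywh((0.5, 0.5, 1, 1), (1.0, 0.5, 1, 1)))  # 0.333...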
+ +from .base import BaseMultimodal +from .actbert import ActBert + +__all__ = ['BaseMultimodal', 'ActBert'] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..107d9c8 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__pycache__/actbert.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__pycache__/actbert.cpython-310.pyc new file mode 100644 index 0000000..4f09ee5 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__pycache__/actbert.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..a2e4620 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/actbert.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/actbert.py new file mode 100644 index 0000000..4f2c074 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/actbert.py @@ -0,0 +1,64 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import MULTIMODAL +from .base import BaseMultimodal +import paddle +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@MULTIMODAL.register() +class ActBert(BaseMultimodal): + """ActBert model framework.""" + def forward_net(self, text_ids, action_feat, image_feat, image_loc, + token_type_ids, text_mask, image_mask, action_mask): + pred = self.backbone(text_ids, action_feat, image_feat, image_loc, + token_type_ids, text_mask, image_mask, action_mask) + return pred + + def train_step(self, data_batch): + """For ActBert Dataset. Define how the model is going to train, from input to output. 
+ """ + text_ids, action_feat, image_feat, image_loc, \ + token_type_ids, text_mask, image_mask, action_mask, \ + text_labels, action_label, next_sentence_label, image_label, image_target = data_batch + loss_metrics = dict() + pred = self.backbone(text_ids, action_feat, image_feat, image_loc, + token_type_ids, text_mask, image_mask, action_mask) + prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score = pred + total_loss = self.loss(prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score, \ + text_labels, image_label, image_target, action_label, next_sentence_label) + loss_metrics['loss'] = paddle.mean(total_loss) + return loss_metrics + + def val_step(self, data_batch): + """For ActBert Dataset. Define how the model is going to val, from input to output. + """ + return self.train_step(data_batch) + + def test_step(self, data_batch): + """For MSR-VTT Dataset. Define how the model is going to test, from input to output.""" + text_ids, action_feat, image_feat, image_loc, token_type_ids, text_mask, image_mask, action_mask = data_batch[: + -1] + action_feat = action_feat.squeeze(0) + image_feat = image_feat.squeeze(0) + image_loc = image_loc.squeeze(0) + image_mask = image_mask.squeeze(0) + action_mask = action_mask.squeeze(0) + prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score = self.forward_net(text_ids, \ + action_feat, image_feat, image_loc, token_type_ids, text_mask, image_mask, action_mask) + return prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score + + def infer_step(self, data_batch): + pass diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/base.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/base.py new file mode 100644 index 0000000..bc57f97 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/base.py @@ -0,0 +1,81 @@ +from abc import abstractmethod +from ... import builder +import paddle.nn as nn + + +class BaseMultimodal(nn.Layer): + """Base class for Multimodal. + + All Multimodal model should subclass it. + All subclass should overwrite: + + - Methods:``train_step``, supporting to forward when training. + - Methods:``valid_step``, supporting to forward when validating. + - Methods:``test_step``, supporting to forward when testing. + + Args: + backbone (dict): Backbone modules to extract feature. + head (dict): Head to process feature. + loss(dict): Loss function. + + """ + def __init__(self, backbone=None, head=None, loss=None): + super().__init__() + if backbone is not None: + self.backbone = builder.build_backbone(backbone) + if hasattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + self.backbone = None + if head is not None: + self.head_name = head.name + self.head = builder.build_head(head) + if hasattr(self.head, 'init_weights'): + self.head.init_weights() + else: + self.head = None + if loss is not None: + self.loss = builder.build_loss(loss) + else: + self.loss = None + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. 
Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating step. + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Test step. + """ + raise NotImplementedError + + @abstractmethod + def infer_step(self, data_batch, **kwargs): + """Infer step. + """ + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__init__.py new file mode 100644 index 0000000..0c6de50 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__init__.py @@ -0,0 +1,18 @@ +# copyright (c) 2020 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license" +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. 
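# --- Illustrative sketch (not part of this diff) ---
# Every framework base class in this diff (detector, estimator, localizer,
# multimodal, partitioner, recognizer) shares the same forward() convention:
# a single 'mode' string routes one call into train/valid/test/infer steps,
# and callers invoke the model as model(data_batch, mode=...). A minimal,
# hypothetical model showing that caller-side usage:

import paddle.nn as nn

class ModeDispatchModel(nn.Layer):
    def forward(self, data_batch, mode='infer'):
        if mode == 'train':
            return self.train_step(data_batch)
        elif mode == 'valid':
            return self.val_step(data_batch)
        elif mode == 'test':
            return self.test_step(data_batch)
        elif mode == 'infer':
            return self.infer_step(data_batch)
        raise NotImplementedError(f"unknown mode: {mode}")

    def train_step(self, data_batch):
        return {'loss': sum(data_batch)}

    def val_step(self, data_batch):
        return self.train_step(data_batch)

    def test_step(self, data_batch):
        return data_batch

    def infer_step(self, data_batch):
        return data_batch

# usage: the trainer calls model(batch, mode='train'); export uses mode='infer'
model = ModeDispatchModel()
print(model([1.0, 2.0], mode='train'))   # {'loss': 3.0}
print(model([1.0, 2.0], mode='infer'))   # [1.0, 2.0]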
+ +from .base import BasePartitioner +from .transnetv2_partitioner import TransNetV2Partitioner + +__all__ = ['BasePartitioner', 'TransNetV2Partitioner'] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..ac5d9bc Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..8194832 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__pycache__/transnetv2_partitioner.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__pycache__/transnetv2_partitioner.cpython-310.pyc new file mode 100644 index 0000000..d9faf4a Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__pycache__/transnetv2_partitioner.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/base.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/base.py new file mode 100644 index 0000000..a7c9259 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/base.py @@ -0,0 +1,84 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import abstractmethod +import paddle.nn as nn +from ... import builder + + +class BasePartitioner(nn.Layer): + """Base class for Partition. + All partitioner should subclass it. + All subclass should overwrite: + - Methods:``train_step``, define your train step. + - Methods:``valid_step``, define your valid step, always the same as train_step. + - Methods:``test_step``, define your test step. + """ + def __init__(self, backbone=None, head=None): + super().__init__() + if backbone is not None: + self.backbone = builder.build_backbone(backbone) + if hasattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + self.backbone = None + if head is not None: + self.head_name = head.name + self.head = builder.build_head(head) + if hasattr(self.head, 'init_weights'): + self.head.init_weights() + else: + self.head = None + + def init_weights(self): + """Initialize the model network weights. """ + if getattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + pass + + def forward(self, data_batch, mode='infer'): + """ + 1. 
Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. input_data_batch -> loss_metric + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating setp. input_data_batch -> loss_metric + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Tets setp. to get acc in test data. input_data_batch -> output + """ + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py new file mode 100644 index 0000000..c329506 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py @@ -0,0 +1,68 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import PARTITIONERS +from .base import BasePartitioner + +import paddle + + +@PARTITIONERS.register() +class TransNetV2Partitioner(BasePartitioner): + """TransNetV2 Partitioner framework + """ + def forward_net(self, imgs): + one_hot_pred = self.backbone(imgs) + return one_hot_pred + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. 
+ """ + frame_sequence = data_batch[0] + one_hot_gt, many_hot_gt = data_batch[1:] + one_hot_pred = self.forward_net(frame_sequence) + dict_ = {} + if isinstance(one_hot_pred, tuple): + one_hot_pred, dict_ = one_hot_pred + many_hot_pred = dict_.get("many_hot", None) + comb_reg_loss = dict_.get("comb_reg_loss", None) + loss_metrics = self.head.loss(one_hot_pred, one_hot_gt, + many_hot_pred, many_hot_gt, + reg_losses={"comb_reg": comb_reg_loss}) + return loss_metrics + + def val_step(self, data_batch): + frame_sequence = data_batch[0] + one_hot_gt, many_hot_gt = data_batch[1:] + one_hot_pred = self.forward_net(frame_sequence) + dict_ = {} + if isinstance(one_hot_pred, tuple): + one_hot_pred, dict_ = one_hot_pred + many_hot_pred = dict_.get("many_hot", None) + comb_reg_loss = dict_.get("comb_reg_loss", None) + loss_metrics = self.head.loss(one_hot_pred, one_hot_gt, + many_hot_pred, many_hot_gt, + reg_losses={"comb_reg": comb_reg_loss}) + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + # NOTE: (shipping) when testing, the net won't call head.loss, we deal with the test processing in /paddlevideo/metrics + frame_sequence = data_batch[0] + one_hot_pred = self.forward_net(frame_sequence) + return one_hot_pred + + def infer_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + frame_sequence = data_batch[0] + one_hot_pred = self.forward_net(frame_sequence) + return one_hot_pred diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__init__.py new file mode 100644 index 0000000..764b37f --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__init__.py @@ -0,0 +1,30 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ +from .base import BaseRecognizer +from .recognizer1d import Recognizer1D, RecognizerAction +from .recognizer2d import Recognizer2D +from .recognizer3d import Recognizer3D +from .recognizer_transformer import RecognizerTransformer +from .recognizer_gcn import RecognizerGCN +from .recognizerMRI import RecognizerMRI +from .recognizer3dMRI import Recognizer3DMRI +from .recognizer_transformer_MRI import RecognizerTransformer_MRI +from .recognizer_movinet_frame import MoViNetRecognizerFrame +from .recognizerDistillation import RecognizerDistillation + +__all__ = [ + 'BaseRecognizer', 'Recognizer1D', 'Recognizer2D', 'Recognizer3D', + 'RecognizerTransformer', 'RecognizerGCN', 'RecognizerMRI', + 'Recognizer3DMRI', 'RecognizerTransformer_MRI', 'MoViNetRecognizerFrame', + 'RecognizerAction', 'RecognizerDistillation' +] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..f671541 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..fec0174 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer1d.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer1d.cpython-310.pyc new file mode 100644 index 0000000..57a4caf Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer1d.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer2d.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer2d.cpython-310.pyc new file mode 100644 index 0000000..6181368 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer2d.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer3d.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer3d.cpython-310.pyc new file mode 100644 index 0000000..cd733c4 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer3d.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer3dMRI.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer3dMRI.cpython-310.pyc new file mode 100644 index 0000000..7e1f46b Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer3dMRI.cpython-310.pyc differ diff --git 
a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizerDistillation.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizerDistillation.cpython-310.pyc new file mode 100644 index 0000000..af8ba91 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizerDistillation.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizerMRI.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizerMRI.cpython-310.pyc new file mode 100644 index 0000000..efac30d Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizerMRI.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_gcn.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_gcn.cpython-310.pyc new file mode 100644 index 0000000..57257fd Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_gcn.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_movinet_frame.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_movinet_frame.cpython-310.pyc new file mode 100644 index 0000000..0f82b4f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_movinet_frame.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_transformer.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_transformer.cpython-310.pyc new file mode 100644 index 0000000..0ad9fce Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_transformer.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_transformer_MRI.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_transformer_MRI.cpython-310.pyc new file mode 100644 index 0000000..197d673 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_transformer_MRI.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/base.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/base.py new file mode 100644 index 0000000..bf31caf --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/base.py @@ -0,0 +1,81 @@ +from abc import abstractmethod +from ... import builder +import paddle.nn as nn + + +class BaseRecognizer(nn.Layer): + """Base class for recognizers. + + All recognizers should subclass it. + All subclass should overwrite: + + - Methods:``train_step``, supporting to forward when training. + - Methods:``valid_step``, supporting to forward when validating. 
+ - Methods:``test_step``, supporting to forward when testing. + + Args: + backbone (dict): Backbone modules to extract feature. + head (dict): Classification head to process feature. + + """ + def __init__(self, backbone=None, head=None, runtime_cfg=None): + + super().__init__() + if backbone is not None: + self.backbone = builder.build_backbone(backbone) + if hasattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + self.backbone = None + if head is not None: + self.head_name = head.name + self.head = builder.build_head(head) + if hasattr(self.head, 'init_weights'): + self.head.init_weights() + else: + self.head = None + + # Settings when the model is running, + # such as 'avg_type' + self.runtime_cfg = runtime_cfg + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating step. + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Test step. + """ + raise NotImplementedError + + @abstractmethod + def infer_step(self, data_batch, **kwargs): + """Infer step. + """ + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer1d.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer1d.py new file mode 100644 index 0000000..2c7fa94 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer1d.py @@ -0,0 +1,111 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer + + +@RECOGNIZERS.register() +class Recognizer1D(BaseRecognizer): + """1D recognizer model framework.""" + def forward_net(self, imgs): + """Define how the model is going to train, from input to output. + """ + lstm_logit, lstm_output = self.head(imgs) + return lstm_logit, lstm_output + + def train_step(self, data_batch): + """Training step. 
+ """ + rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask, labels = data_batch + imgs = [(rgb_data, rgb_len, rgb_mask), + (audio_data, audio_len, audio_mask)] + + # call forward + lstm_logit, lstm_output = self.forward_net(imgs) + loss = self.head.loss(lstm_logit, labels) + hit_at_one, perr, gap = self.head.metric(lstm_output, labels) + loss_metrics = dict() + loss_metrics['loss'] = loss + loss_metrics['hit_at_one'] = hit_at_one + loss_metrics['perr'] = perr + loss_metrics['gap'] = gap + + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + return self.train_step(data_batch) + + def test_step(self, data_batch): + """Testing setp. + """ + return self.train_step(data_batch) + + def infer_step(self, data_batch): + """Infering setp. + """ + rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask = data_batch + imgs = [(rgb_data, rgb_len, rgb_mask), + (audio_data, audio_len, audio_mask)] + # call forward + lstm_logit, _ = self.forward_net(imgs) + return lstm_logit + + +@RECOGNIZERS.register() +class RecognizerAction(BaseRecognizer): + """1D recognizer model framework.""" + def forward_net(self, imgs): + """Define how the model is going to train, from input to output. + """ + lstm_logit, lstm_output = self.head(imgs) + return lstm_logit, lstm_output + + def train_step(self, data_batch): + """Training step. + """ + rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask, labels, labels_iou = data_batch + imgs = [(rgb_data, rgb_len, rgb_mask), + (audio_data, audio_len, audio_mask)] + + # call forward + output_logit, output_iou = self.forward_net(imgs) + loss = self.head.loss(output_logit, output_iou, labels, labels_iou) + top1, top5 = self.head.metric(output_logit, labels) + loss_metrics = dict() + loss_metrics['loss'] = loss + loss_metrics['top1'] = top1 + loss_metrics['top5'] = top5 + + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + return self.train_step(data_batch) + + def test_step(self, data_batch): + """Testing setp. + """ + return self.train_step(data_batch) + + def infer_step(self, data_batch): + """Infering setp. + """ + rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask = data_batch + imgs = [(rgb_data, rgb_len, rgb_mask), + (audio_data, audio_len, audio_mask)] + # call forward + output_logit, output_iou = self.forward_net(imgs) + return output_logit, output_iou diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer2d.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer2d.py new file mode 100644 index 0000000..d8aa661 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer2d.py @@ -0,0 +1,69 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ +from ...registry import RECOGNIZERS +from .base import BaseRecognizer +import paddle +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class Recognizer2D(BaseRecognizer): + """2D recognizer model framework.""" + def forward_net(self, imgs): + # NOTE: As the num_segs is an attribute of dataset phase, and didn't pass to build_head phase, should obtain it from imgs(paddle.Tensor) now, then call self.head method. + num_segs = imgs.shape[ + 1] # imgs.shape=[N,T,C,H,W], for most commonly case + imgs = paddle.reshape_(imgs, [-1] + list(imgs.shape[2:])) + + if self.backbone is not None: + feature = self.backbone(imgs) + else: + feature = imgs + + if self.head is not None: + cls_score = self.head(feature, num_segs) + else: + cls_score = None + + return cls_score + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. + """ + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + loss_metrics = self.head.loss(cls_score, labels) + return loss_metrics + + def val_step(self, data_batch): + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + loss_metrics = self.head.loss(cls_score, labels, valid_mode=True) + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + # NOTE: (shipping) when testing, the net won't call head.loss, we deal with the test processing in /paddlevideo/metrics + imgs = data_batch[0] + cls_score = self.forward_net(imgs) + return cls_score + + def infer_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + imgs = data_batch[0] + cls_score = self.forward_net(imgs) + return cls_score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer3d.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer3d.py new file mode 100644 index 0000000..f0ecff1 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer3d.py @@ -0,0 +1,93 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class Recognizer3D(BaseRecognizer): + """3D Recognizer model framework. + """ + + def forward_net(self, imgs): + """Define how the model is going to run, from input to output. + """ + feature = self.backbone(imgs) + cls_score = self.head(feature) + return cls_score + + def train_step(self, data_batch): + """Training step. 
+ """ + if self.backbone.__class__.__name__ == 'ResNet3dSlowOnly': + imgs = data_batch[0] + labels = data_batch[1:] + if imgs.dim() == 6: + imgs = imgs.reshape([-1] + imgs.shape[2:]) + else: + imgs = data_batch[0:2] + labels = data_batch[2:] + + # call forward + cls_score = self.forward_net(imgs) + loss_metrics = self.head.loss(cls_score, labels) + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + if self.backbone.__class__.__name__ == 'ResNet3dSlowOnly': + imgs = data_batch[0] + labels = data_batch[1:] + if imgs.dim() == 6: + imgs = imgs.reshape([-1] + imgs.shape[2:]) + else: + imgs = data_batch[0:2] + labels = data_batch[2:] + + # call forward + cls_score = self.forward_net(imgs) + loss_metrics = self.head.loss(cls_score, labels, valid_mode=True) + return loss_metrics + + def test_step(self, data_batch): + """Test step. + """ + if self.backbone.__class__.__name__ == 'ResNet3dSlowOnly': + imgs = data_batch[0] + if imgs.dim() == 6: + imgs = imgs.reshape([-1] + imgs.shape[2:]) + else: + imgs = data_batch[0:2] + # call forward + cls_score = self.forward_net(imgs) + + return cls_score + + def infer_step(self, data_batch): + """Infer step. + """ + if self.backbone.__class__.__name__ == 'ResNet3dSlowOnly': + imgs = data_batch[0] + # call forward + imgs = imgs.reshape([-1] + imgs.shape[2:]) + cls_score = self.forward_net(imgs) + else: + imgs = data_batch[0:2] + # call forward + cls_score = self.forward_net(imgs) + + return cls_score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py new file mode 100644 index 0000000..9298491 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py @@ -0,0 +1,81 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer +from paddlevideo.utils import get_logger +import paddle + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class Recognizer3DMRI(BaseRecognizer): + """3D Recognizer model framework. + """ + def forward_net(self, imgs): + """Define how the model is going to run, from input to output. + """ + + imgs[0] = paddle.cast(imgs[0], "float32") + imgs[1] = paddle.cast(imgs[1], "float32") + imgs[0] = imgs[0].unsqueeze(1) + imgs[1] = imgs[1].unsqueeze(1) + + feature = self.backbone(imgs) + cls_score = self.head(feature) + return cls_score + + def train_step(self, data_batch): + """Training step. + """ + imgs = data_batch[0:2] + labels = data_batch[2:] + + # call forward + cls_score = self.forward_net(imgs) + cls_score = paddle.nn.functional.sigmoid(cls_score) + loss_metrics = self.head.loss(cls_score, labels, if_top5=False) + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. 
+ """ + imgs = data_batch[0:2] + labels = data_batch[2:] + + # call forward + cls_score = self.forward_net(imgs) + cls_score = paddle.nn.functional.sigmoid(cls_score) + loss_metrics = self.head.loss(cls_score, + labels, + valid_mode=True, + if_top5=False) + return loss_metrics + + def test_step(self, data_batch): + """Test step. + """ + imgs = data_batch[0:2] + # call forward + cls_score = self.forward_net(imgs) + + return cls_score + + def infer_step(self, data_batch): + """Infer step. + """ + imgs = data_batch[0:2] + # call forward + cls_score = self.forward_net(imgs) + + return cls_score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py new file mode 100644 index 0000000..6f48a08 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py @@ -0,0 +1,231 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from abc import abstractmethod +import paddle +import paddle.nn as nn + +from ...registry import RECOGNIZERS +from ... import builder +from paddlevideo.utils import get_logger, get_dist_info + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class RecognizerDistillation(nn.Layer): + """recognizer Distillation framework.""" + def __init__(self, + freeze_params_list=None, + models=None, + loss=None, + **kargs): + """ + Args: + freeze_params_list: list, set each model is trainable or not + models: config of distillaciton model. 
+ loss: config of loss list + """ + super().__init__() + self.model_list = [] + self.model_name_list = [] + self.loss_cfgs = loss + + if freeze_params_list is None: + freeze_params_list = [False] * len(models) + assert len(freeze_params_list) == len(models) + + # build Teacher and Student model + for idx, model_config in enumerate(models): + assert len(model_config) == 1 + key = list(model_config.keys())[0] #Teacher or Student + model_config = model_config[key] + model_name = model_config['backbone']['name'] + + backbone, head = None, None + if model_config.get('backbone'): + backbone = builder.build_backbone(model_config['backbone']) + if hasattr(backbone, 'init_weights'): + backbone.init_weights() + if model_config.get('head'): + head = builder.build_head(model_config['head']) + if hasattr(head, 'init_weights'): + head.init_weights() + + model = nn.Sequential(backbone, head) + logger.info('build distillation {} model done'.format(key)) + # for add all parameters in nn.Layer class + self.model_list.append(self.add_sublayer(key, model)) + self.model_name_list.append({model_name: key}) + + # set model trainable or not + if freeze_params_list[idx]: + for param in model.parameters(): + param.trainable = False + + # build loss: support for loss list + self.loss_func_list = [] + mode_keys = list(loss.keys()) + for mode in mode_keys: + loss_cfgs = loss[mode] + for loss_cfg in loss_cfgs: + loss_func_dict = {} + model_name_pairs = loss_cfg.pop('model_name_pairs') + loss_func = builder.build_loss(loss_cfg) + loss_func_dict['mode'] = mode + loss_func_dict['loss_func'] = loss_func + loss_func_dict['model_name_pairs'] = model_name_pairs + self.loss_func_list.append(loss_func_dict) + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. 
Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + def get_loss(self, output, labels, mode): + """ + Args: + output: dict, output name and its value + labels: label of data + mode: str, 'Train' or 'Val' + """ + output['GroundTruth'] = labels + loss_list = [] + + for loss_func_dict in self.loss_func_list: + if mode == loss_func_dict['mode']: + model_name_pairs = loss_func_dict['model_name_pairs'] + loss_func = loss_func_dict['loss_func'] + loss_val = loss_func(output[model_name_pairs[0]], + output[model_name_pairs[1]]) + loss_list.append(loss_val) + + total_loss = paddle.add_n(loss_list) + return total_loss + + def get_acc(self, scores, labels, mode='Train'): + def _get_acc(score, label, mode='Train'): + top1 = paddle.metric.accuracy(input=score, label=label, k=1) + top5 = paddle.metric.accuracy(input=score, label=label, k=5) + _, world_size = get_dist_info() + # Deal with multi cards validate + if world_size > 1 and mode == 'Val': #reduce sum when valid + top1 = paddle.distributed.all_reduce( + top1, op=paddle.distributed.ReduceOp.SUM) / world_size + top5 = paddle.distributed.all_reduce( + top5, op=paddle.distributed.ReduceOp.SUM) / world_size + return top1, top5 + + if len(labels) == 1: + label = labels[0] + return _get_acc(scores, label) + # Deal with VideoMix + elif len(labels) == 3: + label_a, label_b, lam = labels + top1a, top5a = _get_acc(scores, label_a, mode) + top1b, top5b = _get_acc(scores, label_b, mode) + top1 = lam * top1a + (1 - lam) * top1b + top5 = lam * top5a + (1 - lam) * top5b + return top1, top5 + + def forward_model(self, imgs, model_name, model): + if model_name in ['PPTSM_v2', 'ResNetTweaksTSM']: + # [N,T,C,H,W] -> [N*T,C,H,W] + imgs = paddle.reshape(imgs, [-1] + list(imgs.shape[2:])) + + return model(imgs) + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. 
+ """ + out = {} + loss_metrics = {} + imgs = data_batch[0] + labels = data_batch[1:] + + for idx, item in enumerate(self.model_name_list): + model = self.model_list[idx] + model_name = list(item.keys())[0] + model_type = item[model_name] # Teacher or Student + out[model_type] = self.forward_model(imgs, model_name, model) + + # out_student, out_teacher + loss = self.get_loss(out, labels, 'Train') + loss_metrics['loss'] = loss + # calculate acc with student output + top1, top5 = self.get_acc(out['Student'], labels) + loss_metrics['top1'] = top1 + loss_metrics['top5'] = top5 + + return loss_metrics + + def val_step(self, data_batch): + out = {} + loss_metrics = {} + imgs = data_batch[0] + labels = data_batch[1:] + + for idx, item in enumerate(self.model_name_list): + model = self.model_list[idx] + model_name = list(item.keys())[0] + model_type = item[model_name] # Teacher or Student + out[model_type] = self.forward_model(imgs, model_name, model) + + # Loss of student with gt: out_student, label + loss = self.get_loss(out, labels, 'Val') + loss_metrics['loss'] = loss + + top1, top5 = self.get_acc(out['Student'], labels, 'Val') + loss_metrics['top1'] = top1 + loss_metrics['top5'] = top5 + + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + imgs = data_batch[0] + + # Use Student to test + for idx, item in enumerate(self.model_name_list): + model = self.model_list[idx] + model_name = list(item.keys())[0] + model_type = item[model_name] # Teacher or Student + if model_type == "Student": + out = self.forward_model(imgs, model_name, model) + + return out + + def infer_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + imgs = data_batch[0] + + # Use Student to infer + for idx, item in enumerate(self.model_name_list): + model = self.model_list[idx] + model_name = list(item.keys())[0] + model_type = item[model_name] # Teacher or Student + if model_type == "Student": + out = self.forward_model(imgs, model_name, model) + + return out diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizerMRI.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizerMRI.py new file mode 100644 index 0000000..4b1713e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizerMRI.py @@ -0,0 +1,76 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer +import paddle +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class RecognizerMRI(BaseRecognizer): + """2D recognizer model framework.""" + def forward_net(self, imgs): + # NOTE: As the num_segs is an attribute of dataset phase, and didn't pass to build_head phase, should obtain it from imgs(paddle.Tensor) now, then call self.head method. 
+ num_segs = imgs.shape[ + 1] # imgs.shape=[N,T,C,H,W], for most commonly case + imgs = paddle.reshape_(imgs, [-1] + list(imgs.shape[2:])) + imgs = paddle.cast(imgs, "float32") ############# + imgs = imgs.unsqueeze(1) + + if self.backbone != None: + feature = self.backbone(imgs) + else: + feature = imgs + + if self.head != None: + cls_score = self.head(feature, num_segs) + else: + cls_score = None + + return cls_score + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. + """ + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + cls_score = paddle.nn.functional.sigmoid(cls_score) + loss_metrics = self.head.loss(cls_score, labels, if_top5=False) + return loss_metrics + + def val_step(self, data_batch): + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + cls_score = paddle.nn.functional.sigmoid(cls_score) + loss_metrics = self.head.loss(cls_score, + labels, + valid_mode=True, + if_top5=False) + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + # NOTE: (shipping) when testing, the net won't call head.loss, we deal with the test processing in /paddlevideo/metrics + imgs = data_batch[0] + cls_score = self.forward_net(imgs) + return cls_score + + def infer_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + imgs = data_batch[0] + cls_score = self.forward_net(imgs) + return cls_score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py new file mode 100644 index 0000000..281c5ac --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py @@ -0,0 +1,87 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class RecognizerGCN(BaseRecognizer): + """GCN Recognizer model framework. + """ + + def __init__(self, + backbone=None, + head=None, + runtime_cfg=None, + if_top5=True): + """ + Args: + backbone (dict): Backbone modules to extract feature. + head (dict): Classification head to process feature. + is_top5 (bool): Whether to display top-5 accuracy during training/validation steps. + """ + super(RecognizerGCN, self).__init__(backbone, head, runtime_cfg) + self.if_top5 = if_top5 + + def forward_net(self, data): + """Define how the model is going to run, from input to output. + """ + feature = self.backbone(data) + cls_score = self.head(feature) + return cls_score + + def train_step(self, data_batch): + """Training step. 
+ """ + data = data_batch[0] + label = data_batch[1:] + + # call forward + cls_score = self.forward_net(data) + loss_metrics = self.head.loss(cls_score, label, if_top5=self.if_top5) + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + data = data_batch[0] + label = data_batch[1:] + + # call forward + cls_score = self.forward_net(data) + loss_metrics = self.head.loss(cls_score, + label, + valid_mode=True, + if_top5=self.if_top5) + return loss_metrics + + def test_step(self, data_batch): + """Test step. + """ + data = data_batch[0] + + # call forward + cls_score = self.forward_net(data) + return cls_score + + def infer_step(self, data_batch): + """Infer step. + """ + data = data_batch[0] + + # call forward + cls_score = self.forward_net(data) + return cls_score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py new file mode 100644 index 0000000..1ad2e14 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py @@ -0,0 +1,78 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import paddle + +from paddlevideo.utils import get_logger +from .base import BaseRecognizer +from ...registry import RECOGNIZERS + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class MoViNetRecognizerFrame(BaseRecognizer): + + def forward_net(self, imgs): + """Define how the model is going to run, from input to output. + """ + self.backbone.clean_activation_buffers() + outputs = self.backbone(imgs) + cls_score = self.head(outputs) + return cls_score + + def train_step(self, data_batch): + """Training step. + """ + imgs = data_batch[0] + labels = data_batch[1] #.astype("int64") + data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4]) + # call forward + cls_score = self.forward_net(data) + loss_metrics = self.head.loss_func(cls_score, labels) + top1 = paddle.metric.accuracy(input=cls_score, label=labels, k=1) + top5 = paddle.metric.accuracy(input=cls_score, label=labels, k=5) + output = {'loss': loss_metrics, 'top1': top1, 'top5': top5} + return output + + def val_step(self, data_batch): + """Validating setp. + """ + imgs = data_batch[0] + labels = data_batch[1] #.astype("int64") + data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4]) + # call forward + cls_score = self.forward_net(data) + loss_metrics = self.head.loss_func(cls_score, labels) + top1 = paddle.metric.accuracy(input=cls_score, label=labels, k=1) + top5 = paddle.metric.accuracy(input=cls_score, label=labels, k=5) + output = {'loss': loss_metrics, 'top1': top1, 'top5': top5} + return output + + def test_step(self, data_batch): + """Test step. + """ + imgs = data_batch[0] + data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4]) + # call forward + cls_score = self.forward_net(data) + return cls_score + + def infer_step(self, data_batch): + """Infer step. 
+ """ + imgs = data_batch[0] + # call forward + data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4]) + cls_score = self.forward_net(data) + + return cls_score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py new file mode 100644 index 0000000..4144eda --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py @@ -0,0 +1,98 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import paddle +import paddle.nn.functional as F +from paddlevideo.utils import get_logger + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class RecognizerTransformer(BaseRecognizer): + """Transformer's recognizer model framework.""" + def forward_net(self, imgs): + # imgs.shape=[N,C,T,H,W], for transformer case + if self.backbone is not None: + feature = self.backbone(imgs) + else: + feature = imgs + + if self.head is not None: + cls_score = self.head(feature) + else: + cls_score = None + + return cls_score + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. + """ + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + loss_metrics = self.head.loss(cls_score, labels) + return loss_metrics + + def val_step(self, data_batch): + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + loss_metrics = self.head.loss(cls_score, labels, valid_mode=True) + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to infer, from input to output.""" + imgs = data_batch[0] + num_views = imgs.shape[2] // self.runtime_cfg.test.num_seg + cls_score = [] + for i in range(num_views): + view = imgs[:, :, i * self.runtime_cfg.test.num_seg:(i + 1) * + self.runtime_cfg.test.num_seg] + cls_score.append(self.forward_net(view)) + cls_score = self._average_view(cls_score, + self.runtime_cfg.test.avg_type) + return cls_score + + def infer_step(self, data_batch): + """Define how the model is going to infer, from input to output.""" + imgs = data_batch[0] + num_views = imgs.shape[2] // self.runtime_cfg.test.num_seg + cls_score = [] + for i in range(num_views): + view = imgs[:, :, i * self.runtime_cfg.test.num_seg:(i + 1) * + self.runtime_cfg.test.num_seg] + cls_score.append(self.forward_net(view)) + cls_score = self._average_view(cls_score, + self.runtime_cfg.test.avg_type) + return cls_score + + def _average_view(self, cls_score, avg_type='score'): + """Combine the predicted results of different views + + Args: + cls_score (list): results of multiple views + avg_type (str, optional): Average calculation method. Defaults to 'score'. 
+ """ + assert avg_type in ['score', 'prob'], \ + f"Currently only the average of 'score' or 'prob' is supported, but got {avg_type}" + if avg_type == 'score': + return paddle.add_n(cls_score) / len(cls_score) + elif avg_type == 'prob': + return paddle.add_n( + [F.softmax(score, axis=-1) + for score in cls_score]) / len(cls_score) + else: + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py new file mode 100644 index 0000000..e8696b4 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py @@ -0,0 +1,104 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import paddle +import paddle.nn.functional as F +from paddlevideo.utils import get_logger + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class RecognizerTransformer_MRI(BaseRecognizer): + """Transformer's recognizer model framework.""" + def forward_net(self, imgs): + # imgs.shape=[N,C,T,H,W], for transformer case + + imgs = paddle.cast(imgs, "float32") ############# + imgs = imgs.unsqueeze(1) + + if self.backbone != None: + feature = self.backbone(imgs) + else: + feature = imgs + + if self.head != None: + cls_score = self.head(feature) + else: + cls_score = None + + return cls_score + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. 
+ """ + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + cls_score = paddle.nn.functional.sigmoid(cls_score) + loss_metrics = self.head.loss(cls_score, labels, if_top5=False) + return loss_metrics + + def val_step(self, data_batch): + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + cls_score = paddle.nn.functional.sigmoid(cls_score) + loss_metrics = self.head.loss(cls_score, + labels, + valid_mode=True, + if_top5=False) + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to infer, from input to output.""" + imgs = data_batch[0] + num_views = imgs.shape[2] // self.backbone.seg_num + cls_score = [] + for i in range(num_views): + view = imgs[:, :, i * self.backbone.seg_num:(i + 1) * + self.backbone.seg_num] + cls_score.append(self.forward_net(view)) + cls_score = self.average_view(cls_score) + return cls_score + + def infer_step(self, data_batch): + """Define how the model is going to infer, from input to output.""" + imgs = data_batch[0] + num_views = imgs.shape[2] // self.backbone.seg_num + cls_score = [] + for i in range(num_views): + view = imgs[:, :, i * self.backbone.seg_num:(i + 1) * + self.backbone.seg_num] + cls_score.append(self.forward_net(view)) + cls_score = self.average_view(cls_score) + return cls_score + + def average_view(self, cls_score, average_type='score'): + """Combine the scores of different views + + Args: + cls_score (list): Scores of multiple views + average_type (str, optional): Average calculation method. Defaults to 'score'. + """ + assert average_type in ['score', 'prob'], \ + f"Currently only the average of 'score' or 'prob' is supported, but got {average_type}" + if average_type == 'score': + return paddle.add_n(cls_score) / len(cls_score) + elif average_type == 'avg': + return paddle.add_n([F.softmax(score) + for score in cls_score]) / len(cls_score) + else: + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__init__.py new file mode 100644 index 0000000..28a1d2e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ +from .base import BaseSegment +from .cfbi import CFBI + +__all__ = ['BaseSegment', 'CFBI'] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..eadd905 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..416bb39 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/cfbi.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/cfbi.cpython-310.pyc new file mode 100644 index 0000000..4d91c88 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/cfbi.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/utils.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000..740259a Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/utils.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/base.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/base.py new file mode 100644 index 0000000..0c5cb07 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/base.py @@ -0,0 +1,90 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from abc import abstractmethod +from ... import builder +import paddle.nn as nn + + +class BaseSegment(nn.Layer): + """Base class for semi-Video Object Segmentation. + All subclass should overwrite: + + - Methods:``train_step``, supporting to forward when training. + - Methods:``valid_step``, supporting to forward when validating. + - Methods:``test_step``, supporting to forward when testing. + + Args: + backbone (dict): Backbone modules to extract feature. + head (dict): Head to process feature. + loss(dict): Loss function. 
+ """ + def __init__(self, backbone=None, head=None, loss=None): + super().__init__() + if backbone is not None: + self.backbone = builder.build_backbone(backbone) + if hasattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + self.backbone = None + if head is not None: + self.head_name = head.name + self.head = builder.build_head(head) + if hasattr(self.head, 'init_weights'): + self.head.init_weights() + else: + self.head = None + if loss is not None: + self.loss = builder.build_loss(loss) + else: + self.loss = None + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating step. + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Test step. + """ + raise NotImplementedError + + @abstractmethod + def infer_step(self, data_batch, **kwargs): + """Infer step. + """ + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/cfbi.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/cfbi.py new file mode 100644 index 0000000..dcdc512 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/cfbi.py @@ -0,0 +1,286 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + +from .utils import foreground2background, global_matching_for_eval, local_matching, calculate_attention_head_for_eval +from ...registry import SEGMENT +from .base import BaseSegment +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@SEGMENT.register() +class CFBI(BaseSegment): + """CFBI model framework.""" + def __init__(self, backbone=None, head=None, loss=None): + super().__init__(backbone, head, loss) + x1 = paddle.zeros([3, 1, 1, 1]) + self.bg_bias = paddle.create_parameter( + shape=x1.shape, + dtype=x1.dtype, + default_initializer=nn.initializer.Assign(x1)) + self.fg_bias = paddle.create_parameter( + shape=x1.shape, + dtype=x1.dtype, + default_initializer=nn.initializer.Assign(x1)) + self.epsilon = 1e-05 + + def test_step(self, data_batch): + """Define how the model is going to test, from input to output. 
+ """ + self.test_mode = True + ref_embeddings, ref_masks, prev_embedding, prev_mask, current_frame, pred_size, gt_ids = data_batch + current_frame_embedding_4x, current_frame_embedding_8x, current_frame_embedding_16x, \ + current_low_level = self.backbone(current_frame) + + current_frame_embedding = [ + current_frame_embedding_4x, current_frame_embedding_8x, + current_frame_embedding_16x + ] + + if prev_embedding is None: + return None, current_frame_embedding + else: + bs, c, h, w = current_frame_embedding_4x.shape + + tmp_dic, _ = self.before_seghead_process( + ref_embeddings, + prev_embedding, + current_frame_embedding, + ref_masks, + prev_mask, + gt_ids, + current_low_level=current_low_level, + ) + all_pred = [] + for i in range(bs): + pred = tmp_dic[i] + + pred = F.interpolate(pred, + size=[pred_size[0], pred_size[1]], + mode='bilinear', + align_corners=True) + all_pred.append(pred) + all_pred = paddle.concat(all_pred, axis=0) + all_pred = F.softmax(all_pred, axis=1) + return all_pred, current_frame_embedding + + def before_seghead_process(self, + ref_frame_embeddings=None, + previous_frame_embeddings=None, + current_frame_embeddings=None, + ref_frame_labels=None, + previous_frame_mask=None, + gt_ids=None, + current_low_level=None): + """ process befor segmentation head""" + TEST_GLOBAL_MATCHING_CHUNK = [4, 1, 1] + TEST_GLOBAL_ATROUS_RATE = [2, 1, 1] + TRAIN_LOCAL_ATROUS_RATE = [2, 1, 1] + TEST_LOCAL_ATROUS_RATE = [2, 1, 1] + MODEL_FLOAT16_MATCHING = False + TEST_GLOBAL_MATCHING_MIN_PIXEL = 100 + MODEL_MULTI_LOCAL_DISTANCE = [[4, 8, 12, 16, 20, 24], + [2, 4, 6, 8, 10, 12], [2, 4, 6, 8, 10]] + TRAIN_LOCAL_PARALLEL = True + TEST_LOCAL_PARALLEL = True + MODEL_MATCHING_BACKGROUND = True + MODEL_SEMANTIC_MATCHING_DIM = [32, 64, 128] + + dic_tmp = [] + boards = {} + scale_ref_frame_labels = [] + scale_previous_frame_labels = [] + for current_frame_embedding in current_frame_embeddings: + bs, c, h, w = current_frame_embedding.shape + if not self.test_mode: + raise NotImplementedError + else: + ref_frame_embeddings = list(zip(*ref_frame_embeddings)) + all_scale_ref_frame_label = [] + for ref_frame_label in ref_frame_labels: + scale_ref_frame_label = paddle.cast(F.interpolate( + paddle.cast(ref_frame_label, dtype="float32"), + size=(h, w), + mode='nearest'), + dtype="int32") + all_scale_ref_frame_label.append(scale_ref_frame_label) + scale_ref_frame_labels.append(all_scale_ref_frame_label) + scale_previous_frame_label = paddle.cast(F.interpolate( + paddle.cast(previous_frame_mask, dtype="float32"), + size=(h, w), + mode='nearest'), + dtype="int32") + scale_previous_frame_labels.append(scale_previous_frame_label) + for n in range(bs): + ref_obj_ids = paddle.reshape( + paddle.cast(paddle.arange(0, + np.array(gt_ids)[n] + 1), + dtype="int32"), [-1, 1, 1, 1]) + obj_num = ref_obj_ids.shape[0] + low_level_feat = paddle.unsqueeze(current_low_level[n], axis=0) + all_CE_input = [] + all_attention_head = [] + for scale_idx, current_frame_embedding, ref_frame_embedding, previous_frame_embedding, \ + scale_ref_frame_label, scale_previous_frame_label in zip(range(3), \ + current_frame_embeddings, ref_frame_embeddings, previous_frame_embeddings, \ + scale_ref_frame_labels, scale_previous_frame_labels): + #Prepare + seq_current_frame_embedding = current_frame_embedding[n] + seq_prev_frame_embedding = previous_frame_embedding[n] + seq_previous_frame_label = paddle.cast( + (paddle.cast(scale_previous_frame_label[n], dtype="int32") + == ref_obj_ids), + dtype="float32") + if np.array(gt_ids)[n] > 0: + dis_bias 
= paddle.concat([ + paddle.unsqueeze(self.bg_bias[scale_idx], axis=0), + paddle.expand( + paddle.unsqueeze(self.fg_bias[scale_idx], axis=0), + [np.array(gt_ids)[n], -1, -1, -1]) + ], + axis=0) + else: + dis_bias = paddle.unsqueeze(self.bg_bias[scale_idx], axis=0) + #Global FG map + matching_dim = MODEL_SEMANTIC_MATCHING_DIM[scale_idx] + seq_current_frame_embedding_for_matching = paddle.transpose( + seq_current_frame_embedding[:matching_dim], [1, 2, 0]) + + if not self.test_mode: + raise NotImplementedError + else: + all_scale_ref_frame_label = scale_ref_frame_label + all_ref_frame_embedding = ref_frame_embedding + all_reference_embeddings = [] + all_reference_labels = [] + seq_ref_frame_labels = [] + count = 0 + for idx in range(len(all_scale_ref_frame_label)): + + ref_frame_embedding = all_ref_frame_embedding[idx] + scale_ref_frame_label = all_scale_ref_frame_label[idx] + + seq_ref_frame_embedding = ref_frame_embedding[n] + seq_ref_frame_embedding = paddle.transpose( + seq_ref_frame_embedding, [1, 2, 0]) + seq_ref_frame_label = paddle.cast( + (paddle.cast(scale_ref_frame_label[n], + dtype="int32") == ref_obj_ids), + dtype="float32") + seq_ref_frame_labels.append(seq_ref_frame_label) + seq_ref_frame_label = paddle.transpose( + paddle.squeeze(seq_ref_frame_label, axis=1), + [1, 2, 0]) + all_reference_embeddings.append( + seq_ref_frame_embedding[:, :, :matching_dim]) + all_reference_labels.append(seq_ref_frame_label) + global_matching_fg = global_matching_for_eval( + all_reference_embeddings=all_reference_embeddings, + query_embeddings= + seq_current_frame_embedding_for_matching, + all_reference_labels=all_reference_labels, + n_chunks=TEST_GLOBAL_MATCHING_CHUNK[scale_idx], + dis_bias=dis_bias, + atrous_rate=TEST_GLOBAL_ATROUS_RATE[scale_idx], + use_float16=MODEL_FLOAT16_MATCHING, + atrous_obj_pixel_num=TEST_GLOBAL_MATCHING_MIN_PIXEL) + + # Local FG map + seq_prev_frame_embedding_for_matching = paddle.transpose( + seq_prev_frame_embedding[:matching_dim], [1, 2, 0]) + seq_previous_frame_label_for_matching = paddle.transpose( + paddle.squeeze(seq_previous_frame_label, axis=1), [1, 2, 0]) + local_matching_fg = local_matching( + prev_frame_embedding=seq_prev_frame_embedding_for_matching, + query_embedding=seq_current_frame_embedding_for_matching, + prev_frame_labels=seq_previous_frame_label_for_matching, + multi_local_distance=MODEL_MULTI_LOCAL_DISTANCE[scale_idx], + dis_bias=dis_bias, + atrous_rate=TRAIN_LOCAL_ATROUS_RATE[scale_idx] if + not self.test_mode else TEST_LOCAL_ATROUS_RATE[scale_idx], + use_float16=MODEL_FLOAT16_MATCHING, + allow_downsample=False, + allow_parallel=TRAIN_LOCAL_PARALLEL + if not self.test_mode else TEST_LOCAL_PARALLEL) + + #Aggregate Pixel-level Matching + to_cat_global_matching_fg = paddle.transpose( + paddle.squeeze(global_matching_fg, axis=0), [2, 3, 0, 1]) + to_cat_local_matching_fg = paddle.transpose( + paddle.squeeze(local_matching_fg, axis=0), [2, 3, 0, 1]) + all_to_cat = [ + to_cat_global_matching_fg, to_cat_local_matching_fg, + seq_previous_frame_label + ] + + #Global and Local BG map + if MODEL_MATCHING_BACKGROUND: + to_cat_global_matching_bg = foreground2background( + to_cat_global_matching_fg, + np.array(gt_ids)[n] + 1) + reshaped_prev_nn_feature_n = paddle.unsqueeze( + paddle.transpose(to_cat_local_matching_fg, + [0, 2, 3, 1]), + axis=1) + to_cat_local_matching_bg = foreground2background( + reshaped_prev_nn_feature_n, + np.array(gt_ids)[n] + 1) + to_cat_local_matching_bg = paddle.squeeze(paddle.transpose( + to_cat_local_matching_bg, [0, 4, 2, 3, 1]), + 
axis=-1) + all_to_cat += [ + to_cat_local_matching_bg, to_cat_global_matching_bg + ] + + to_cat_current_frame_embedding = paddle.expand( + paddle.unsqueeze(current_frame_embedding[n], axis=0), + [obj_num, -1, -1, -1]) + to_cat_prev_frame_embedding = paddle.expand( + paddle.unsqueeze(previous_frame_embedding[n], axis=0), + [obj_num, -1, -1, -1]) + to_cat_prev_frame_embedding_fg = to_cat_prev_frame_embedding * seq_previous_frame_label + to_cat_prev_frame_embedding_bg = to_cat_prev_frame_embedding * ( + 1 - seq_previous_frame_label) + all_to_cat += [ + to_cat_current_frame_embedding, + to_cat_prev_frame_embedding_fg, + to_cat_prev_frame_embedding_bg + ] + + CE_input = paddle.concat(all_to_cat, axis=1) + #Instance-level Attention + if not self.test_mode: + raise NotImplementedError + else: + attention_head = calculate_attention_head_for_eval( + all_ref_frame_embedding, + seq_ref_frame_labels, + paddle.expand( + paddle.unsqueeze(previous_frame_embedding[n], + axis=0), [obj_num, -1, -1, -1]), + seq_previous_frame_label, + epsilon=self.epsilon) + + all_CE_input.append(CE_input) + all_attention_head.append(attention_head) + + #Collaborative Ensembler + pred = self.head(all_CE_input, all_attention_head, low_level_feat) + dic_tmp.append(pred) + + return dic_tmp, boards diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/utils.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/utils.py new file mode 100644 index 0000000..1ec3be4 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/utils.py @@ -0,0 +1,754 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def foreground2background(dis, obj_num): + if obj_num == 1: + return dis + bg_dis = [] + for i in range(obj_num): + obj_back = [] + for j in range(obj_num): + if i == j: + continue + obj_back.append(paddle.unsqueeze(dis[j], axis=0)) + obj_back = paddle.concat(x=obj_back, axis=1) + obj_back = paddle.min(x=obj_back, axis=1, keepdim=True) + bg_dis.append(obj_back) + bg_dis = paddle.concat(x=bg_dis, axis=0) + return bg_dis + + +WRONG_LABEL_PADDING_DISTANCE = 5e4 + + +#GLOBAL_DIST_MAP +def _pairwise_distances(x, x2, y, y2): + """ + Computes pairwise squared l2 distances between tensors x and y. + Args: + x: [n, feature_dim]. + y: [m, feature_dim]. + Returns: + d: [n, m]. + """ + xs = x2 + ys = y2 + + xs = paddle.unsqueeze(xs, axis=1) + ys = paddle.unsqueeze(ys, axis=0) + d = xs + ys - 2. * paddle.matmul(x, y, transpose_y=True) + return d + + +def _flattened_pairwise_distances(reference_embeddings, ref_square, + query_embeddings, query_square): + """ + Calculates flattened tensor of pairwise distances between ref and query. 
+ Args: + reference_embeddings: [..., embedding_dim], + the embedding vectors for the reference frame + query_embeddings: [..., embedding_dim], + the embedding vectors for the query frames. + Returns: + dists: [reference_embeddings.size / embedding_dim, query_embeddings.size / embedding_dim] + """ + dists = _pairwise_distances(query_embeddings, query_square, + reference_embeddings, ref_square) + return dists + + +def _nn_features_per_object_for_chunk(reference_embeddings, ref_square, + query_embeddings, query_square, + wrong_label_mask): + """Extracts features for each object using nearest neighbor attention. + Args: + reference_embeddings: [n_chunk, embedding_dim], + the embedding vectors for the reference frame. + query_embeddings: [m_chunk, embedding_dim], + the embedding vectors for the query frames. + wrong_label_mask: [n_objects, n_chunk], + the mask for pixels not used for matching. + Returns: + nn_features: A float32 tensor of nearest neighbor features of shape + [m_chunk, n_objects, n_chunk]. + """ + if reference_embeddings.dtype == "float16": + wrong_label_mask = paddle.cast(wrong_label_mask, dtype="float16") + else: + wrong_label_mask = paddle.cast(wrong_label_mask, dtype="float32") + + reference_embeddings_key = reference_embeddings + query_embeddings_key = query_embeddings + dists = _flattened_pairwise_distances(reference_embeddings_key, ref_square, + query_embeddings_key, query_square) + dists = (paddle.unsqueeze(dists, axis=1) + + paddle.unsqueeze(wrong_label_mask, axis=0) * + WRONG_LABEL_PADDING_DISTANCE) + features = paddle.min(dists, axis=2, keepdim=True) + return features + + +def _nearest_neighbor_features_per_object_in_chunks(reference_embeddings_flat, + query_embeddings_flat, + reference_labels_flat, + n_chunks): + """Calculates the nearest neighbor features per object in chunks to save mem. + Uses chunking to bound the memory use. + Args: + reference_embeddings_flat: [n, embedding_dim], + the embedding vectors for the reference frame. + query_embeddings_flat: [m, embedding_dim], + the embedding vectors for the query frames. + reference_labels_flat: [n, n_objects], + the class labels of the reference frame. + n_chunks: Integer, the number of chunks to use to save memory + (set to 1 for no chunking). + Returns: + nn_features: [m, n_objects, n]. 
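+        Illustration (numbers assumed): with 10000 query pixels and
+        n_chunks=4, chunk_size = ceil(10000 / 4) = 2500, so the [m, n]
+        distance matrix is materialised 2500 query rows at a time rather
+        than all at once, bounding peak memory during matching.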
+ """ + + feature_dim, embedding_dim = query_embeddings_flat.shape + chunk_size = int(np.ceil(float(feature_dim) / n_chunks)) + wrong_label_mask = reference_labels_flat < 0.1 + + wrong_label_mask = paddle.transpose(x=wrong_label_mask, perm=[1, 0]) + ref_square = paddle.sum(paddle.pow(reference_embeddings_flat, 2), axis=1) + query_square = paddle.sum(paddle.pow(query_embeddings_flat, 2), axis=1) + + all_features = [] + for n in range(n_chunks): + if n_chunks == 1: + query_embeddings_flat_chunk = query_embeddings_flat + query_square_chunk = query_square + chunk_start = 0 + else: + chunk_start = n * chunk_size + chunk_end = (n + 1) * chunk_size + query_square_chunk = query_square[chunk_start:chunk_end] + if query_square_chunk.shape[0] == 0: + continue + query_embeddings_flat_chunk = query_embeddings_flat[ + chunk_start:chunk_end] + features = _nn_features_per_object_for_chunk( + reference_embeddings_flat, ref_square, query_embeddings_flat_chunk, + query_square_chunk, wrong_label_mask) + all_features.append(features) + if n_chunks == 1: + nn_features = all_features[0] + else: + nn_features = paddle.concat(all_features, axis=0) + + return nn_features + + +def global_matching(reference_embeddings, + query_embeddings, + reference_labels, + n_chunks=100, + dis_bias=0., + ori_size=None, + atrous_rate=1, + use_float16=True, + atrous_obj_pixel_num=0): + """ + Calculates the distance to the nearest neighbor per object. + For every pixel of query_embeddings calculate the distance to the + nearest neighbor in the (possibly subsampled) reference_embeddings per object. + Args: + reference_embeddings: [height, width, embedding_dim], + the embedding vectors for the reference frame. + query_embeddings: [height, width, + embedding_dim], the embedding vectors for the query frames. + reference_labels: [height, width, obj_nums], + the class labels of the reference frame. + n_chunks: Integer, the number of chunks to use to save memory + (set to 1 for no chunking). + dis_bias: [n_objects], foreground and background bias + ori_size: (ori_height, ori_width), + the original spatial size. If "None", (ori_height, ori_width) = (height, width). + atrous_rate: Integer, the atrous rate of reference_embeddings. + use_float16: Bool, if "True", use float16 type for matching. + Returns: + nn_features: [1, ori_height, ori_width, n_objects, feature_dim]. + """ + + assert (reference_embeddings.shape[:2] == reference_labels.shape[:2]) + if use_float16: + query_embeddings = paddle.cast(query_embeddings, dtype="float16") + reference_embeddings = paddle.cast(reference_embeddings, + dtype="float16") + h, w, embedding_dim = query_embeddings.shape + obj_nums = reference_labels.shape[2] + + if atrous_rate > 1: + h_pad = (atrous_rate - h % atrous_rate) % atrous_rate + w_pad = (atrous_rate - w % atrous_rate) % atrous_rate + selected_points = paddle.zeros([h + h_pad, w + w_pad]) + selected_points = selected_points.view( + (h + h_pad) // atrous_rate, atrous_rate, (w + w_pad) // atrous_rate, + atrous_rate) + selected_points[:, 0, :, 0] = 1. 
+ selected_points = paddle.reshape(selected_points, + [h + h_pad, w + w_pad, 1])[:h, :w] + is_big_obj = (paddle.sum( + reference_labels, + axis=(0, 1))) > (atrous_obj_pixel_num * atrous_rate**2) + reference_labels[:, :, + is_big_obj] = reference_labels[:, :, + is_big_obj] * selected_points + + reference_embeddings_flat = paddle.reshape(reference_embeddings, + [-1, embedding_dim]) + reference_labels_flat = paddle.reshape(reference_labels, [-1, obj_nums]) + query_embeddings_flat = paddle.reshape(query_embeddings, + [-1, embedding_dim]) + + all_ref_fg = paddle.sum(reference_labels_flat, axis=1, keepdim=True) > 0.9 + reference_labels_flat = paddle.reshape( + paddle.masked_select(reference_labels_flat, + paddle.expand(all_ref_fg, [-1, obj_nums])), + [-1, obj_nums]) + if reference_labels_flat.shape[0] == 0: + return paddle.ones([1, h, w, obj_nums, 1]) + reference_embeddings_flat = paddle.reshape( + paddle.masked_select(reference_embeddings_flat, + paddle.expand(all_ref_fg, [-1, embedding_dim])), + [-1, embedding_dim]) + + nn_features = _nearest_neighbor_features_per_object_in_chunks( + reference_embeddings_flat, query_embeddings_flat, reference_labels_flat, + n_chunks) + + nn_features_reshape = paddle.reshape(nn_features, [1, h, w, obj_nums, 1]) + nn_features_reshape = ( + F.sigmoid(nn_features_reshape + + paddle.reshape(dis_bias, [1, 1, 1, -1, 1])) - 0.5) * 2 + + #TODO: ori_size is not None + + if use_float16: + nn_features_reshape = paddle.cast(nn_features_reshape, dtype="float32") + return nn_features_reshape + + +def global_matching_for_eval(all_reference_embeddings, + query_embeddings, + all_reference_labels, + n_chunks=20, + dis_bias=0., + ori_size=None, + atrous_rate=1, + use_float16=True, + atrous_obj_pixel_num=0): + """ + Calculates the distance to the nearest neighbor per object. + For every pixel of query_embeddings calculate the distance to the + nearest neighbor in the (possibly subsampled) reference_embeddings per object. + Args: + all_reference_embeddings: A list of reference_embeddings, + each with size [height, width, embedding_dim], + the embedding vectors for the reference frame. + query_embeddings: [n_query_images, height, width, + embedding_dim], the embedding vectors for the query frames. + all_reference_labels: A list of reference_labels, + each with size [height, width, obj_nums], + the class labels of the reference frame. + n_chunks: Integer, the number of chunks to use to save memory + (set to 1 for no chunking). + dis_bias: [n_objects], foreground and background bias + ori_size: (ori_height, ori_width), + the original spatial size. If "None", (ori_height, ori_width) = (height, width). + atrous_rate: Integer, the atrous rate of reference_embeddings. + use_float16: Bool, if "True", use float16 type for matching. + Returns: + nn_features: [n_query_images, ori_height, ori_width, n_objects, feature_dim]. + """ + + h, w, embedding_dim = query_embeddings.shape + obj_nums = all_reference_labels[0].shape[2] + all_reference_embeddings_flat = [] + all_reference_labels_flat = [] + ref_num = len(all_reference_labels) + n_chunks *= ref_num + if atrous_obj_pixel_num > 0: + if atrous_rate > 1: + h_pad = (atrous_rate - h % atrous_rate) % atrous_rate + w_pad = (atrous_rate - w % atrous_rate) % atrous_rate + selected_points = paddle.zeros([h + h_pad, w + w_pad]) + selected_points = paddle.reshape( + selected_points, [(h + h_pad) // atrous_rate, atrous_rate, + (w + w_pad) // atrous_rate, atrous_rate]) + selected_points[:, 0, :, 0] = 1. 
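+            # In eval mode every reference frame gets the same stride-based
+            # subsampling; the per-frame embeddings and labels are then
+            # flattened and concatenated below, so matching runs once over the
+            # union of reference pixels (n_chunks was already scaled by
+            # ref_num above to keep chunk sizes comparable).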
+ selected_points = paddle.reshape(selected_points, + [h + h_pad, w + w_pad, 1])[:h, :w] + + for reference_embeddings, reference_labels, idx in zip( + all_reference_embeddings, all_reference_labels, range(ref_num)): + if atrous_rate > 1: + is_big_obj = paddle.sum( + reference_labels, + axis=(0, 1)) > (atrous_obj_pixel_num * atrous_rate**2) + is_big_obj = list(np.array(is_big_obj)) + for j in range(len(is_big_obj)): + if is_big_obj[j] == True: + reference_labels[:, :, j:j + + 1] = reference_labels[:, :, j:j + + 1] * selected_points + + reference_embeddings_flat = paddle.reshape(reference_embeddings, + [-1, embedding_dim]) + reference_labels_flat = paddle.reshape(reference_labels, + [-1, obj_nums]) + + all_reference_embeddings_flat.append(reference_embeddings_flat) + all_reference_labels_flat.append(reference_labels_flat) + + reference_embeddings_flat = paddle.concat( + x=all_reference_embeddings_flat, axis=0) + reference_labels_flat = paddle.concat(x=all_reference_labels_flat, + axis=0) + else: + if ref_num == 1: + reference_embeddings, reference_labels = all_reference_embeddings[ + 0], all_reference_labels[0] + if atrous_rate > 1: + h_pad = (atrous_rate - h % atrous_rate) % atrous_rate + w_pad = (atrous_rate - w % atrous_rate) % atrous_rate + if h_pad > 0 or w_pad > 0: + reference_embeddings = F.pad(reference_embeddings, + [0, h_pad, 0, w_pad, 0, 0]) + reference_labels = F.pad(reference_labels, + [0, h_pad, 0, w_pad, 0, 0]) + reference_embeddings = paddle.reshape( + reference_embeddings, + [(h + h_pad) // atrous_rate, atrous_rate, + (w + w_pad) // atrous_rate, atrous_rate, 32]) + reference_labels = paddle.reshape( + reference_labels, + [(h + h_pad) // atrous_rate, atrous_rate, + (w + w_pad) // atrous_rate, atrous_rate, -1]) + reference_embeddings = paddle.reshape( + reference_embeddings[:, 0, :, 0, :], + reference_embeddings[:, 0, :, 0, :].shape) + reference_labels = paddle.reshape( + reference_labels[:, 0, :, 0, :], + reference_labels[:, 0, :, 0, :].shape) + reference_embeddings_flat = paddle.reshape(reference_embeddings, + [-1, embedding_dim]) + reference_labels_flat = paddle.reshape(reference_labels, + [-1, obj_nums]) + else: + for reference_embeddings, reference_labels, idx in zip( + all_reference_embeddings, all_reference_labels, + range(ref_num)): + if atrous_rate > 1: + h_pad = (atrous_rate - h % atrous_rate) % atrous_rate + w_pad = (atrous_rate - w % atrous_rate) % atrous_rate + if h_pad > 0 or w_pad > 0: + reference_embeddings = F.pad(reference_embeddings, + [0, h_pad, 0, w_pad, 0, 0]) + reference_labels = F.pad(reference_labels, + [0, h_pad, 0, w_pad, 0, 0]) + + reference_embeddings = paddle.reshape( + reference_embeddings, + [(h + h_pad) // atrous_rate, atrous_rate, + (w + w_pad) // atrous_rate, atrous_rate, -1]) + reference_labels = paddle.reshape( + reference_labels, + [(h + h_pad) // atrous_rate, atrous_rate, + (w + w_pad) // atrous_rate, atrous_rate, -1]) + reference_embeddings = paddle.reshape( + reference_embeddings[:, 0, :, 0, :], + reference_embeddings[:, 0, :, 0, :].shape) + reference_labels = paddle.reshape( + reference_labels[:, 0, :, 0, :], + reference_labels[:, 0, :, 0, :].shape) + + reference_embeddings_flat = paddle.reshape( + reference_embeddings, [-1, embedding_dim]) + reference_labels_flat = paddle.reshape(reference_labels, + [-1, obj_nums]) + + all_reference_embeddings_flat.append(reference_embeddings_flat) + all_reference_labels_flat.append(reference_labels_flat) + + reference_embeddings_flat = paddle.concat( + all_reference_embeddings_flat, axis=0) + 
reference_labels_flat = paddle.concat(all_reference_labels_flat, + axis=0) + + query_embeddings_flat = paddle.reshape(query_embeddings, + [-1, embedding_dim]) + + all_ref_fg = paddle.sum(reference_labels_flat, axis=1, keepdim=True) > 0.9 + reference_labels_flat = paddle.reshape( + paddle.masked_select(reference_labels_flat, + paddle.expand(all_ref_fg, [-1, obj_nums])), + [-1, obj_nums]) + if reference_labels_flat.shape[0] == 0: + return paddle.ones([1, h, w, obj_nums, 1]) + reference_embeddings_flat = paddle.reshape( + paddle.masked_select(reference_embeddings_flat, + paddle.expand(all_ref_fg, [-1, embedding_dim])), + [-1, embedding_dim]) + if use_float16: + query_embeddings_flat = paddle.cast(query_embeddings_flat, + dtype="float16") + reference_embeddings_flat = paddle.cast(reference_embeddings_flat, + dtype="float16") + nn_features = _nearest_neighbor_features_per_object_in_chunks( + reference_embeddings_flat, query_embeddings_flat, reference_labels_flat, + n_chunks) + + nn_features_reshape = paddle.reshape(nn_features, [1, h, w, obj_nums, 1]) + nn_features_reshape = ( + F.sigmoid(nn_features_reshape + + paddle.reshape(dis_bias, [1, 1, 1, -1, 1])) - 0.5) * 2 + + # TODO: ori_size is not None + + if use_float16: + nn_features_reshape = paddle.cast(nn_features_reshape, dtype="float32") + return nn_features_reshape + + +#LOCAL_DIST_MAP +def local_pairwise_distances(x, + y, + max_distance=9, + atrous_rate=1, + allow_downsample=False): + """Computes pairwise squared l2 distances using a local search window. + Use for-loop for saving memory. + Args: + x: Float32 tensor of shape [height, width, feature_dim]. + y: Float32 tensor of shape [height, width, feature_dim]. + max_distance: Integer, the maximum distance in pixel coordinates + per dimension which is considered to be in the search window. + atrous_rate: Integer, the atrous rate of local matching. + allow_downsample: Bool, if "True", downsample x and y + with a stride of 2. + Returns: + Float32 distances tensor of shape [height, width, (2 * max_distance + 1) ** 2]. + """ + if allow_downsample: + ori_height = x.shape[0] + ori_width = x.shape[1] + x = paddle.unsqueeze(paddle.transpose(x, [2, 0, 1]), axis=0) + y = paddle.unsqueeze(paddle.transpose(y, [2, 0, 1]), axis=0) + down_size = (int(ori_height / 2) + 1, int(ori_width / 2) + 1) + x = F.interpolate(x, + size=down_size, + mode='bilinear', + align_corners=True) + y = F.interpolate(y, + size=down_size, + mode='bilinear', + align_corners=True) + x = paddle.unsqueeze(paddle.transpose(x, [1, 2, 0]), axis=0) + y = paddle.unsqueeze(paddle.transpose(y, [1, 2, 0]), axis=0) + + pad_max_distance = max_distance - max_distance % atrous_rate + # no change pad + padded_y = F.pad(y, (0, 0, pad_max_distance, pad_max_distance, + pad_max_distance, pad_max_distance), + value=WRONG_LABEL_PADDING_DISTANCE) + + height, width, _ = x.shape + dists = [] + for y in range(2 * pad_max_distance // atrous_rate + 1): + y_start = y * atrous_rate + y_end = y_start + height + y_slice = padded_y[y_start:y_end] + for x in range(2 * max_distance + 1): + x_start = x * atrous_rate + x_end = x_start + width + offset_y = y_slice[:, x_start:x_end] + dist = paddle.sum(paddle.pow((x - offset_y), 2), axis=2) + dists.append(dist) + dists = paddle.stack(dists, axis=2) + + return dists + + +def local_pairwise_distances_parallel(x, + y, + max_distance=9, + atrous_rate=1, + allow_downsample=True): + """Computes pairwise squared l2 distances using a local search window. + Args: + x: Float32 tensor of shape [height, width, feature_dim]. 
+ y: Float32 tensor of shape [height, width, feature_dim]. + max_distance: Integer, the maximum distance in pixel coordinates + per dimension which is considered to be in the search window. + atrous_rate: Integer, the atrous rate of local matching. + allow_downsample: Bool, if "True", downsample x and y + with a stride of 2. + Returns: + Float32 distances tensor of shape [height, width, (2 * max_distance + 1) ** 2]. + """ + + ori_height, ori_width, _ = x.shape + x = paddle.unsqueeze(paddle.transpose(x, [2, 0, 1]), axis=0) + y = paddle.unsqueeze(paddle.transpose(y, [2, 0, 1]), axis=0) + if allow_downsample: + down_size = (int(ori_height / 2) + 1, int(ori_width / 2) + 1) + x = F.interpolate(x, + size=down_size, + mode='bilinear', + align_corners=True) + y = F.interpolate(y, + size=down_size, + mode='bilinear', + align_corners=True) + + _, channels, height, width = x.shape + + x2 = paddle.reshape(paddle.sum(paddle.pow(x, 2), axis=1), + [height, width, 1]) + y2 = paddle.reshape(paddle.sum(paddle.pow(y, 2), axis=1), + [1, 1, height, width]) + + pad_max_distance = max_distance - max_distance % atrous_rate + # no change pad + padded_y = F.pad(y, (pad_max_distance, pad_max_distance, pad_max_distance, + pad_max_distance)) + padded_y2 = F.pad(y2, (pad_max_distance, pad_max_distance, pad_max_distance, + pad_max_distance), + value=WRONG_LABEL_PADDING_DISTANCE) + + offset_y = paddle.transpose( + paddle.reshape( + F.unfold(x=padded_y, + kernel_sizes=[height, width], + strides=[atrous_rate, atrous_rate]), + [channels, height * width, -1]), [1, 0, 2]) + offset_y2 = paddle.reshape( + F.unfold(padded_y2, + kernel_sizes=[height, width], + strides=[atrous_rate, atrous_rate]), [height, width, -1]) + x = paddle.transpose(paddle.reshape(x, [channels, height * width, -1]), + [1, 2, 0]) + + dists = x2 + offset_y2 - 2. * paddle.reshape(paddle.matmul(x, offset_y), + [height, width, -1]) + + return dists + + +def local_matching(prev_frame_embedding, + query_embedding, + prev_frame_labels, + dis_bias=0., + multi_local_distance=[15], + ori_size=None, + atrous_rate=1, + use_float16=True, + allow_downsample=True, + allow_parallel=True): + """Computes nearest neighbor features while only allowing local matches. + Args: + prev_frame_embedding: [height, width, embedding_dim], + the embedding vectors for the last frame. + query_embedding: [height, width, embedding_dim], + the embedding vectors for the query frames. + prev_frame_labels: [height, width, n_objects], + the class labels of the previous frame. + multi_local_distance: A list of Integer, + a list of maximum distance allowed for local matching. + ori_size: (ori_height, ori_width), + the original spatial size. If "None", (ori_height, ori_width) = (height, width). + atrous_rate: Integer, the atrous rate of local matching. + use_float16: Bool, if "True", use float16 type for matching. + allow_downsample: Bool, if "True", downsample prev_frame_embedding and query_embedding + with a stride of 2. + allow_parallel: Bool, if "True", do matching in a parallel way. If "False", do matching in + a for-loop way, which will save GPU memory. + Returns: + nn_features: A float32 np.array of nearest neighbor features of shape + [1, height, width, n_objects, 1]. 
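+        With the defaults assumed here (multi_local_distance=[15],
+        atrous_rate=1), max_distance=15 and every query pixel is compared
+        against a (2 * 15 + 1) ** 2 = 961 position window around its
+        previous-frame location.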
+ """ + max_distance = multi_local_distance[-1] + + if ori_size is None: + height, width = prev_frame_embedding.shape[:2] + ori_size = (height, width) + + obj_num = prev_frame_labels.shape[2] + pad = paddle.ones([1]) * WRONG_LABEL_PADDING_DISTANCE + if use_float16: + query_embedding = paddle.cast(query_embedding, dtype="float16") + prev_frame_embedding = paddle.cast(prev_frame_embedding, + dtype="float16") + pad = paddle.cast(pad, dtype="float16") + + if allow_parallel: + d = local_pairwise_distances_parallel(query_embedding, + prev_frame_embedding, + max_distance=max_distance, + atrous_rate=atrous_rate, + allow_downsample=allow_downsample) + else: + d = local_pairwise_distances(query_embedding, + prev_frame_embedding, + max_distance=max_distance, + atrous_rate=atrous_rate, + allow_downsample=allow_downsample) + + height, width = d.shape[:2] + + labels = paddle.unsqueeze(paddle.transpose(prev_frame_labels, [2, 0, 1]), 1) + labels = paddle.unsqueeze(paddle.transpose(prev_frame_labels, [2, 0, 1]), + axis=1) + if (height, width) != ori_size: + labels = F.interpolate(labels, size=(height, width), mode='nearest') + + pad_max_distance = max_distance - max_distance % atrous_rate + atrous_max_distance = pad_max_distance // atrous_rate + #no change pad + padded_labels = F.pad(labels, ( + pad_max_distance, + pad_max_distance, + pad_max_distance, + pad_max_distance, + ), + mode='constant', + value=0) + + offset_masks = paddle.transpose( + paddle.reshape( + F.unfold(padded_labels, + kernel_sizes=[height, width], + strides=[atrous_rate, atrous_rate]), + [obj_num, height, width, -1]), [1, 2, 3, 0]) > 0.9 + + d_tiled = paddle.expand(paddle.unsqueeze( + d, axis=-1), [-1, -1, -1, obj_num]) # h, w, num_local_pos, obj_num + + d_masked = paddle.where(offset_masks, d_tiled, pad) + dists = paddle.min(d_masked, axis=2) + multi_dists = [ + paddle.unsqueeze(paddle.transpose(dists, [2, 0, 1]), axis=1) + ] # n_objects, num_multi_local, h, w + + reshaped_d_masked = paddle.reshape(d_masked, [ + height, width, 2 * atrous_max_distance + 1, 2 * atrous_max_distance + 1, + obj_num + ]) + for local_dis in multi_local_distance[:-1]: + local_dis = local_dis // atrous_rate + start_idx = atrous_max_distance - local_dis + end_idx = atrous_max_distance + local_dis + 1 + new_d_masked = paddle.reshape( + reshaped_d_masked[:, :, start_idx:end_idx, start_idx:end_idx, :], + reshaped_d_masked[:, :, start_idx:end_idx, + start_idx:end_idx, :].shape) + new_d_masked = paddle.reshape(new_d_masked, + [height, width, -1, obj_num]) + new_dists = paddle.min(new_d_masked, axis=2) + new_dists = paddle.unsqueeze(paddle.transpose(new_dists, [2, 0, 1]), + axis=1) + multi_dists.append(new_dists) + + multi_dists = paddle.concat(multi_dists, axis=1) + multi_dists = (F.sigmoid(multi_dists + + paddle.reshape(dis_bias, [-1, 1, 1, 1])) - 0.5) * 2 + + if use_float16: + multi_dists = paddle.cast(multi_dists, dtype="float32") + + if (height, width) != ori_size: + multi_dists = F.interpolate(multi_dists, + size=ori_size, + mode='bilinear', + align_corners=True) + multi_dists = paddle.transpose(multi_dists, perm=[2, 3, 0, 1]) + multi_dists = paddle.reshape(multi_dists, + [1, ori_size[0], ori_size[1], obj_num, -1]) + + return multi_dists + + +def calculate_attention_head(ref_embedding, + ref_label, + prev_embedding, + prev_label, + epsilon=1e-5): + + ref_head = ref_embedding * ref_label + ref_head_pos = paddle.sum(ref_head, axis=(2, 3)) + ref_head_neg = paddle.sum(ref_embedding, axis=(2, 3)) - ref_head_pos + ref_pos_num = paddle.sum(ref_label, axis=(2, 3)) + 
ref_neg_num = paddle.sum(1. - ref_label, axis=(2, 3)) + ref_head_pos = ref_head_pos / (ref_pos_num + epsilon) + ref_head_neg = ref_head_neg / (ref_neg_num + epsilon) + + prev_head = prev_embedding * prev_label + prev_head_pos = paddle.sum(prev_head, axis=(2, 3)) + prev_head_neg = paddle.sum(prev_embedding, axis=(2, 3)) - prev_head_pos + prev_pos_num = paddle.sum(prev_label, axis=(2, 3)) + prev_neg_num = paddle.sum(1. - prev_label, axis=(2, 3)) + prev_head_pos = prev_head_pos / (prev_pos_num + epsilon) + prev_head_neg = prev_head_neg / (prev_neg_num + epsilon) + + total_head = paddle.concat( + x=[ref_head_pos, ref_head_neg, prev_head_pos, prev_head_neg], axis=1) + + return total_head + + +def calculate_attention_head_for_eval(ref_embeddings, + ref_labels, + prev_embedding, + prev_label, + epsilon=1e-5): + total_ref_head_pos = 0. + total_ref_head_neg = 0. + total_ref_pos_num = 0. + total_ref_neg_num = 0. + + for idx in range(len(ref_embeddings)): + ref_embedding = ref_embeddings[idx] + ref_label = ref_labels[idx] + ref_head = ref_embedding * ref_label + ref_head_pos = paddle.sum(ref_head, axis=(2, 3)) + ref_head_neg = paddle.sum(ref_embedding, axis=(2, 3)) - ref_head_pos + ref_pos_num = paddle.sum(ref_label, axis=(2, 3)) + ref_neg_num = paddle.sum(1. - ref_label, axis=(2, 3)) + total_ref_head_pos = total_ref_head_pos + ref_head_pos + total_ref_head_neg = total_ref_head_neg + ref_head_neg + total_ref_pos_num = total_ref_pos_num + ref_pos_num + total_ref_neg_num = total_ref_neg_num + ref_neg_num + ref_head_pos = total_ref_head_pos / (total_ref_pos_num + epsilon) + ref_head_neg = total_ref_head_neg / (total_ref_neg_num + epsilon) + + prev_head = prev_embedding * prev_label + prev_head_pos = paddle.sum(prev_head, axis=(2, 3)) + prev_head_neg = paddle.sum(prev_embedding, axis=(2, 3)) - prev_head_pos + prev_pos_num = paddle.sum(prev_label, axis=(2, 3)) + prev_neg_num = paddle.sum(1. - prev_label, axis=(2, 3)) + prev_head_pos = prev_head_pos / (prev_pos_num + epsilon) + prev_head_neg = prev_head_neg / (prev_neg_num + epsilon) + + total_head = paddle.concat( + x=[ref_head_pos, ref_head_neg, prev_head_pos, prev_head_neg], axis=1) + return total_head diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__init__.py new file mode 100644 index 0000000..de4bf57 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ +from .base import BaseSegmenter +from .ms_tcn import MSTCN +from .asrf import ASRF + +__all__ = ['BaseSegmenter', 'MSTCN', 'ASRF'] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..c427464 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/asrf.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/asrf.cpython-310.pyc new file mode 100644 index 0000000..a035c2c Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/asrf.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..abe08a4 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/ms_tcn.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/ms_tcn.cpython-310.pyc new file mode 100644 index 0000000..8bcaf14 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/ms_tcn.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/utils.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000..0acc81f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/utils.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/asrf.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/asrf.py new file mode 100644 index 0000000..3d962c7 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/asrf.py @@ -0,0 +1,143 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
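+# ASRF pairs a frame-level classification branch with an action-boundary
+# branch; the raw outputs of both are refined by ASRFPostProcessing
+# (defined in segmenters/utils.py) before metrics are computed.
+# Minimal usage sketch (config dicts and data shapes assumed, not part of
+# this file):
+#   model = ASRF(postprocessing_method="refinement_with_boundary",
+#                boundary_threshold=0.7,
+#                backbone=backbone_cfg, head=head_cfg, loss=loss_cfg)
+#   metrics = model(data_batch, mode='train')  # -> {'loss', 'F1@0.50'}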
+ +from ...registry import SEGMENTERS +from .base import BaseSegmenter + +import paddle +import paddle.nn.functional as F +from .utils import ASRFPostProcessing + + +@SEGMENTERS.register() +class ASRF(BaseSegmenter): + """ASRF model framework.""" + + def __init__(self, + postprocessing_method, + boundary_threshold, + backbone=None, + head=None, + loss=None): + + super().__init__(backbone=backbone, head=head, loss=loss) + self.postprocessing_method = postprocessing_method + self.boundary_threshold = boundary_threshold + + def forward_net(self, video_feature): + """Define how the model is going to train, from input to output. + """ + if self.backbone is not None: + feature = self.backbone(video_feature) + else: + feature = video_feature + + if self.head is not None: + network_outputs = self.head(feature) + else: + network_outputs = None + + return network_outputs + + def train_step(self, data_batch): + """Training step. + """ + feature, label, boundary = data_batch + # call forward + outputs_cls, outputs_boundary = self.forward_net(feature) + + # transfer data + outputs_cls_np = outputs_cls[-1].numpy() + outputs_boundary_np = outputs_boundary[-1].numpy() + + # caculate loss + if self.loss is not None: + output_loss = self.loss(feature, outputs_cls, label, + outputs_boundary, boundary) + else: + output_loss = None + + # predict post process + predicted = ASRFPostProcessing(outputs_cls_np, outputs_boundary_np, + self.postprocessing_method) + predicted = paddle.squeeze(predicted) + + loss_metrics = dict() + loss_metrics['loss'] = output_loss + loss_metrics['F1@0.50'] = self.head.get_F1_score(predicted, label) + + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + feature, label, boundary = data_batch + + # call forward + outputs_cls, outputs_boundary = self.forward_net(feature) + + # transfer data + outputs_cls_np = outputs_cls[-1].numpy() + outputs_boundary_np = outputs_boundary[-1].numpy() + + ## caculate loss + if self.loss is not None: + output_loss = self.loss(feature, outputs_cls, label, + outputs_boundary, boundary) + else: + output_loss = None + + # predict post process + predicted = ASRFPostProcessing(outputs_cls_np, outputs_boundary_np, + self.postprocessing_method) + predicted = paddle.squeeze(predicted) + + outputs_dict = dict() + outputs_dict['loss'] = output_loss + outputs_dict['F1@0.50'] = self.head.get_F1_score(predicted, label) + return outputs_dict + + def test_step(self, data_batch): + """Testing setp. + """ + feature, _, _ = data_batch + + outputs_dict = dict() + # call forward + outputs_cls, outputs_boundary = self.forward_net(feature) + # transfer data + outputs_cls_np = outputs_cls[-1].numpy() + outputs_boundary_np = outputs_boundary[-1].numpy() + + # predict post process + predicted = ASRFPostProcessing(outputs_cls_np, outputs_boundary_np, + self.postprocessing_method) + outputs_dict['predict'] = paddle.to_tensor(predicted[0, :]) + outputs_dict['output_np'] = F.sigmoid(outputs_cls[-1]) + return outputs_dict + + def infer_step(self, data_batch): + """Infering setp. 
+ """ + feature = data_batch[0] + + # call forward + outputs_cls, outputs_boundary = self.forward_net(feature) + # transfer data + outputs_cls_np = outputs_cls[-1] + outputs_boundary_np = outputs_boundary[-1] + + outputs = [ + outputs_cls_np, outputs_boundary_np, + F.sigmoid(outputs_cls[-1]) + ] + return outputs diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/base.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/base.py new file mode 100644 index 0000000..e0856d9 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/base.py @@ -0,0 +1,100 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from abc import abstractmethod +from ... import builder +import paddle.nn as nn + + +class BaseSegmenter(nn.Layer): + """Base class for segementers. + + All segementers should subclass it. + All subclass should overwrite: + + - Methods:``train_step``, supporting to forward when training. + - Methods:``valid_step``, supporting to forward when validating. + - Methods:``test_step``, supporting to forward when testing. + + Args: + backbone (dict): Backbone modules to extract feature. + head (dict): Classification head to process feature. + + """ + + def __init__(self, backbone=None, head=None, loss=None): + + super().__init__() + # build backbone + if backbone is not None: + self.backbone = builder.build_backbone(backbone) + if hasattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + self.backbone = None + # build head + if head is not None: + self.head_name = head.name + self.head = builder.build_head(head) + if hasattr(self.head, 'init_weights'): + self.head.init_weights() + else: + self.head = None + # build loss + if loss is not None: + self.loss_name = loss.name + self.loss = builder.build_loss(loss) + if hasattr(self.loss, 'init_weights'): + self.loss.init_weights() + else: + self.loss = None + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating step. + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Test step. + """ + raise NotImplementedError + + @abstractmethod + def infer_step(self, data_batch, **kwargs): + """Infer step. 
+ """ + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/ms_tcn.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/ms_tcn.py new file mode 100644 index 0000000..a5982a7 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/ms_tcn.py @@ -0,0 +1,101 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import SEGMENTERS +from .base import BaseSegmenter + +import paddle +import paddle.nn.functional as F + + +@SEGMENTERS.register() +class MSTCN(BaseSegmenter): + """MS-TCN model framework.""" + + def forward_net(self, video_feature): + """Define how the model is going to train, from input to output. + """ + if self.backbone is not None: + feature = self.backbone(video_feature) + else: + feature = video_feature + + if self.head is not None: + cls_score = self.head(feature) + else: + cls_score = None + + return cls_score + + def train_step(self, data_batch): + """Training step. + """ + video_feat, video_gt = data_batch + + # call forward + output = self.forward_net(video_feat) + loss = 0. + for i in range(len(output)): + loss += self.head.loss(output[i], video_gt) + + predicted = paddle.argmax(output[-1], axis=1) + predicted = paddle.squeeze(predicted) + + loss_metrics = dict() + loss_metrics['loss'] = loss + loss_metrics['F1@0.50'] = self.head.get_F1_score(predicted, video_gt) + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + video_feat, video_gt = data_batch + + # call forward + output = self.forward_net(video_feat) + loss = 0. + for i in range(len(output)): + loss += self.head.loss(output[i], video_gt) + + predicted = paddle.argmax(output[-1], axis=1) + predicted = paddle.squeeze(predicted) + + outputs_dict = dict() + outputs_dict['loss'] = loss + outputs_dict['F1@0.50'] = self.head.get_F1_score(predicted, video_gt) + return outputs_dict + + def test_step(self, data_batch): + """Testing setp. + """ + video_feat, _ = data_batch + + outputs_dict = dict() + # call forward + output = self.forward_net(video_feat) + predicted = paddle.argmax(output[-1], axis=1) + predicted = paddle.squeeze(predicted) + outputs_dict['predict'] = predicted + outputs_dict['output_np'] = F.sigmoid(output[-1]) + return outputs_dict + + def infer_step(self, data_batch): + """Infering setp. + """ + video_feat = data_batch[0] + + # call forward + output = self.forward_net(video_feat) + predicted = paddle.argmax(output[-1], axis=1) + predicted = paddle.squeeze(predicted) + output_np = F.sigmoid(output[-1]) + return predicted, output_np diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/utils.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/utils.py new file mode 100644 index 0000000..9c21cbb --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/utils.py @@ -0,0 +1,343 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +# https://github.com/yiskw713/asrf/libs/postprocess.py + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +import math + + +class GaussianSmoothing(nn.Layer): + """ + Apply gaussian smoothing on a 1d tensor. + Filtering is performed seperately for each channel + in the input using a depthwise convolution. + Arguments: + channels (int, sequence): Number of channels of the input tensors. Output will + have this number of channels as well. + kernel_size (int, sequence): Size of the gaussian kernel. + sigma (float, sequence): Standard deviation of the gaussian kernel. + """ + + def __init__(self, kernel_size=15, sigma=1.0): + super().__init__() + self.kernel_size = kernel_size + + # The gaussian kernel is the product of the + # gaussian function of each dimension. + kernel = 1 + meshgrid = paddle.arange(kernel_size) + + meshgrid = paddle.cast(meshgrid, dtype='float32') + + mean = (kernel_size - 1) / 2 + kernel = kernel / (sigma * math.sqrt(2 * math.pi)) + kernel = kernel * paddle.exp(-(((meshgrid - mean) / sigma)**2) / 2) + + # Make sure sum of values in gaussian kernel equals 1. + # kernel = kernel / paddle.max(kernel) + + self.kernel = paddle.reshape(kernel, [1, 1, -1]) + + def forward(self, inputs): + """ + Apply gaussian filter to input. + Arguments: + input (paddle.Tensor): Input to apply gaussian filter on. + Returns: + filtered (paddle.Tensor): Filtered output. + """ + _, c, _ = inputs.shape + inputs = F.pad(inputs, + pad=((self.kernel_size - 1) // 2, + (self.kernel_size - 1) // 2), + mode="reflect", + data_format='NCL') + + kernel = paddle.expand(self.kernel, shape=[c, 1, self.kernel_size]) + return F.conv1d(inputs, weight=kernel, groups=c) + + +def argrelmax(prob, threshold=0.7): + """ + Calculate arguments of relative maxima. + prob: np.array. 
boundary probability maps distributerd in [0, 1] + prob shape is (T) + ignore the peak whose value is under threshold + + Return: + Index of peaks for each batch + """ + # ignore the values under threshold + prob[prob < threshold] = 0.0 + + # calculate the relative maxima of boundary maps + # treat the first frame as boundary + peak = np.concatenate( + [ + np.ones((1), dtype=np.bool), + (prob[:-2] < prob[1:-1]) & (prob[2:] < prob[1:-1]), + np.zeros((1), dtype=np.bool), + ], + axis=0, + ) + + peak_idx = np.where(peak)[0].tolist() + + return peak_idx + + +def is_probability(x): + assert x.ndim == 3 + + if x.shape[1] == 1: + # sigmoid + if x.min() >= 0 and x.max() <= 1: + return True + else: + return False + else: + # softmax + _sum = np.sum(x, axis=1).astype(np.float32) + _ones = np.ones_like(_sum, dtype=np.float32) + return np.allclose(_sum, _ones) + + +def convert2probability(x): + """ + Args: x (N, C, T) + """ + assert x.ndim == 3 + + if is_probability(x): + return x + else: + if x.shape[1] == 1: + # sigmoid + prob = 1 / (1 + np.exp(-x)) + else: + # softmax + prob = np.exp(x) / np.sum(np.exp(x), axis=1) + return prob.astype(np.float32) + + +def convert2label(x): + assert x.ndim == 2 or x.ndim == 3 + + if x.ndim == 2: + return x.astype(np.int64) + else: + if not is_probability(x): + x = convert2probability(x) + + label = np.argmax(x, axis=1) + return label.astype(np.int64) + + +def refinement_with_boundary(outputs, boundaries, boundary_threshold): + """ + Get segments which is defined as the span b/w two boundaries, + and decide their classes by majority vote. + Args: + outputs: numpy array. shape (N, C, T) + the model output for frame-level class prediction. + boundaries: numpy array. shape (N, 1, T) + boundary prediction. + boundary_threshold: the threshold of the size of action segments. float(default=0.7) + Return: + preds: np.array. shape (N, T) + final class prediction considering boundaries. + """ + + preds = convert2label(outputs) + boundaries = convert2probability(boundaries) + + for i, (output, pred, boundary) in enumerate(zip(outputs, preds, + boundaries)): + idx = argrelmax(boundary[0, :], threshold=boundary_threshold) + + # add the index of the last action ending + T = pred.shape[0] + idx.append(T) + + # majority vote + for j in range(len(idx) - 1): + count = np.bincount(pred[idx[j]:idx[j + 1]]) + modes = np.where(count == count.max())[0] + if len(modes) == 1: + mode = modes + else: + if outputs.ndim == 3: + # if more than one majority class exist + prob_sum_max = 0 + for m in modes: + prob_sum = output[m, idx[j]:idx[j + 1]].sum() + if prob_sum_max < prob_sum: + mode = m + prob_sum_max = prob_sum + else: + # decide first mode when more than one majority class + # have the same number during oracle experiment + mode = modes[0] + + preds[i, idx[j]:idx[j + 1]] = mode + return preds + + +def relabeling(outputs, theta_t): + """ + Relabeling small action segments with their previous action segment + Args: + output: the results of action segmentation. (N, T) or (N, C, T) + theta_t: the threshold of the size of action segments. + Return: + relabeled output. 
(N, T) + """ + + preds = convert2label(outputs) + + for i in range(preds.shape[0]): + # shape (T,) + last = preds[i][0] + cnt = 1 + for j in range(1, preds.shape[1]): + if last == preds[i][j]: + cnt += 1 + else: + if cnt > theta_t: + cnt = 1 + last = preds[i][j] + else: + preds[i][j - cnt:j] = preds[i][j - cnt - 1] + cnt = 1 + last = preds[i][j] + + if cnt <= theta_t: + preds[i][j - cnt:j] = preds[i][j - cnt - 1] + + return preds + + +def smoothing(outputs, filter_func): + """ + Smoothing action probabilities with gaussian filter. + Args: + outputs: frame-wise action probabilities. (N, C, T) + Return: + predictions: final prediction. (N, T) + """ + + outputs = convert2probability(outputs) + outputs = filter_func(paddle.to_tensor(outputs)).numpy() + + preds = convert2label(outputs) + return preds + + +def ASRFPostProcessing(outputs_cls, + outputs_boundary, + refinement_method, + boundary_threshold=0.7, + theta_t=15, + kernel_size=15): + """ + ASRF post processing is to refine action boundary + Args: + outputs_cls: the results of action segmentation. (N, T) or (N, C, T) + outputs_boundary: action boundary probability. (N, 1, T) + refinement_method: the way of refine predict boundary and classification. str + boundary_threshold: the threshold of the size of action segments. float(default=0.7) + theta_t: the threshold of the size of action segments. int(default=15) + kernel_size: Size of the gaussian kernel. int(default=15) + Return: + preds output. (N, T) + """ + func = [ + "refinement_with_boundary", + "relabeling", + "smoothing", + ] + + if refinement_method == "smoothing": + filter_func = GaussianSmoothing(kernel_size) + preds = smoothing(outputs_cls, filter_func) + elif refinement_method == "relabeling": + preds = relabeling(outputs_cls, theta_t) + elif refinement_method == "refinement_with_boundary": + preds = refinement_with_boundary(outputs_cls, outputs_boundary, + boundary_threshold) + else: + preds = np.zeros((1, 1)) + assert refinement_method in func + + return paddle.to_tensor(preds) + + +def _calculate_fan_in_and_fan_out(tensor): + dimensions = len(tensor.shape) + if dimensions < 2: + raise ValueError("Fan in and fan out can not be computed \ + for tensor with fewer than 2 dimensions") + + if dimensions == 2: # Linear + fan_in = tensor.shape[1] + fan_out = tensor.shape[0] + else: + num_input_fmaps = tensor.shape[1] + num_output_fmaps = tensor.shape[0] + receptive_field_size = 1 + if tensor.dim() > 2: + receptive_field_size = tensor[0][0].numel() + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def calculate_gain(nonlinearity=None, a=None): + if nonlinearity == 'tanh': + return 5.0 / 3 + elif nonlinearity == 'relu': + return math.sqrt(2.0) + elif nonlinearity == 'leaky_relu': + if a is not None: + return math.sqrt(2.0 / (1 + a**2)) + else: + return math.sqrt(2.0 / (1 + 0.01**2)) + elif nonlinearity == 'selu': + return 3.0 / 4 + else: + return 1 + + +def KaimingUniform_like_torch(weight_npy, + mode='fan_in', + nonlinearity='leaky_relu'): + fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy) + if mode == 'fan_in': + fan_mode = fan_in + else: + fan_mode = fan_out + a = math.sqrt(5.0) + gain = calculate_gain(nonlinearity=nonlinearity, a=a) + std = gain / math.sqrt(fan_mode) + bound = math.sqrt(3.0) * std + return np.random.uniform(-bound, bound, weight_npy.shape) + + +def init_bias(weight_npy, bias_npy): + fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy) + bound = 1.0 / 
math.sqrt(fan_in) + return np.random.uniform(-bound, bound, bias_npy.shape) diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__init__.py new file mode 100644 index 0000000..49f71cc --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__init__.py @@ -0,0 +1,49 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .adds_head import AddsHead +from .asrf_head import ASRFHead +from .attention_lstm_head import AttentionLstmHead, ActionAttentionLstmHead +from .base import BaseHead +from .bbox_head import BBoxHeadAVA +from .cfbi_head import CollaborativeEnsemblerMS +from .i3d_head import I3DHead +from .movinet_head import MoViNetHead +from .ms_tcn_head import MSTCNHead +from .pptimesformer_head import ppTimeSformerHead +from .pptsm_head import ppTSMHead +from .pptsn_head import ppTSNHead +from .roi_head import AVARoIHead +from .single_straight3d import SingleRoIExtractor3D +from .slowfast_head import SlowFastHead +from .stgcn_head import STGCNHead +from .timesformer_head import TimeSformerHead +from .transnetv2_head import TransNetV2Head +from .tsm_head import TSMHead +from .tsn_head import TSNHead +from .ms_tcn_head import MSTCNHead +from .asrf_head import ASRFHead +from .ctrgcn_head import CTRGCNHead +from .movinet_head import MoViNetHead +from .agcn2s_head import AGCN2sHead +from .token_shift_head import TokenShiftHead + +__all__ = [ + 'BaseHead', 'TSNHead', 'TSMHead', 'ppTSMHead', 'ppTSNHead', 'SlowFastHead', + 'AttentionLstmHead', 'TimeSformerHead', 'STGCNHead', 'TransNetV2Head', + 'I3DHead', 'SingleRoIExtractor3D', 'AVARoIHead', 'BBoxHeadAVA', 'AddsHead', + 'ppTimeSformerHead', 'CollaborativeEnsemblerMS', 'MSTCNHead', 'ASRFHead', + 'MoViNetHead', 'CTRGCNHead', 'TokenShiftHead', 'ActionAttentionLstmHead', + 'AGCN2sHead' +] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..b559ad2 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/adds_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/adds_head.cpython-310.pyc new file mode 100644 index 0000000..2227f8f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/adds_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/agcn2s_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/agcn2s_head.cpython-310.pyc new file mode 100644 index 0000000..6fc341a Binary files /dev/null and 
b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/agcn2s_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/asrf_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/asrf_head.cpython-310.pyc new file mode 100644 index 0000000..77d72d8 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/asrf_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/attention_lstm_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/attention_lstm_head.cpython-310.pyc new file mode 100644 index 0000000..ec74413 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/attention_lstm_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..8ca7372 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/bbox_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/bbox_head.cpython-310.pyc new file mode 100644 index 0000000..e862576 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/bbox_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/cfbi_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/cfbi_head.cpython-310.pyc new file mode 100644 index 0000000..e0af3e0 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/cfbi_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/ctrgcn_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/ctrgcn_head.cpython-310.pyc new file mode 100644 index 0000000..77001ca Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/ctrgcn_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/i3d_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/i3d_head.cpython-310.pyc new file mode 100644 index 0000000..78535f4 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/i3d_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/movinet_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/movinet_head.cpython-310.pyc new file mode 100644 index 0000000..981fd7a Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/movinet_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/ms_tcn_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/ms_tcn_head.cpython-310.pyc new file mode 100644 index 0000000..69bf0c6 Binary files /dev/null and 
b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/ms_tcn_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/pptimesformer_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/pptimesformer_head.cpython-310.pyc new file mode 100644 index 0000000..966829f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/pptimesformer_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/pptsm_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/pptsm_head.cpython-310.pyc new file mode 100644 index 0000000..b8382f0 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/pptsm_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/pptsn_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/pptsn_head.cpython-310.pyc new file mode 100644 index 0000000..90b5293 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/pptsn_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/roi_extractor.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/roi_extractor.cpython-310.pyc new file mode 100644 index 0000000..e70db29 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/roi_extractor.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/roi_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/roi_head.cpython-310.pyc new file mode 100644 index 0000000..194ca51 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/roi_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/single_straight3d.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/single_straight3d.cpython-310.pyc new file mode 100644 index 0000000..e3ab758 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/single_straight3d.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/slowfast_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/slowfast_head.cpython-310.pyc new file mode 100644 index 0000000..f7c5b61 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/slowfast_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/stgcn_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/stgcn_head.cpython-310.pyc new file mode 100644 index 0000000..6acc1c5 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/stgcn_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/timesformer_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/timesformer_head.cpython-310.pyc new file mode 100644 index 0000000..a8276cf Binary 
files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/timesformer_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/token_shift_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/token_shift_head.cpython-310.pyc new file mode 100644 index 0000000..141da04 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/token_shift_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/transnetv2_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/transnetv2_head.cpython-310.pyc new file mode 100644 index 0000000..c993c17 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/transnetv2_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/tsm_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/tsm_head.cpython-310.pyc new file mode 100644 index 0000000..991d912 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/tsm_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/tsn_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/tsn_head.cpython-310.pyc new file mode 100644 index 0000000..abb2d14 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/tsn_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/adds_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/adds_head.py new file mode 100644 index 0000000..3b1cd24 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/adds_head.py @@ -0,0 +1,146 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cv2 +import numpy as np +import paddle.nn as nn +from paddlevideo.utils import get_dist_info +import paddle +from ..builder import build_loss +from ..registry import HEADS + +MIN_DEPTH = 1e-3 +MAX_DEPTH = 80 + + +@HEADS.register() +class AddsHead(nn.Layer): + """TimeSformerHead Head. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + std(float): Std(Scale) value in normal initilizar. Default: 0.01. + kwargs (dict, optional): Any keyword argument to initialize. 
+ + """ + def __init__(self, + avg_reprojection, + disparity_smoothness, + no_ssim, + loss_cfg=dict(name='ADDSLoss'), + max_gt_depth=60, + pred_depth_scale_factor=1): + + super(AddsHead, self).__init__() + loss_cfg['avg_reprojection'] = avg_reprojection + loss_cfg['disparity_smoothness'] = disparity_smoothness + loss_cfg['no_ssim'] = no_ssim + self.max_gt_depth = max_gt_depth + self.pred_depth_scale_factor = pred_depth_scale_factor + self.loss_func = build_loss(loss_cfg) + + def forward(self): + raise NotImplemented + + def loss(self, inputs, outputs): + if self.training: + return self.loss_func(inputs, outputs) + else: + abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = self.get_metrics( + outputs['pred_disp'], outputs['gt']) + outputs['abs_rel'] = abs_rel + outputs['sq_rel'] = sq_rel + outputs['rmse'] = rmse + outputs['rmse_log'] = rmse_log + outputs['a1'] = a1 + outputs['a2'] = a2 + outputs['a3'] = a3 + return outputs + + def get_metrics(self, pred_disp, gt_depth): + gt_height, gt_width = gt_depth.shape[:2] + + pred_disp = cv2.resize(pred_disp, (gt_width, gt_height)) + pred_depth = 1 / pred_disp + + mask = gt_depth > 0 + + pred_depth = pred_depth[mask] + gt_depth = gt_depth[mask] + + pred_depth *= self.pred_depth_scale_factor + ratio = np.median(gt_depth) / np.median(pred_depth) + pred_depth *= ratio + + pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH + pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH + + mask2 = gt_depth <= self.max_gt_depth + pred_depth = pred_depth[mask2] + gt_depth = gt_depth[mask2] + + abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = self.compute_errors( + gt_depth, pred_depth) + + _, world_size = get_dist_info() + if world_size > 1: + # educe sum when valid + # TODO: there are some problems with multi gpu gather code. + abs_rel = paddle.to_tensor(abs_rel) + sq_rel = paddle.to_tensor(sq_rel) + rmse = paddle.to_tensor(rmse) + rmse_log = paddle.to_tensor(rmse_log) + a1 = paddle.to_tensor(a1) + a2 = paddle.to_tensor(a2) + a3 = paddle.to_tensor(a3) + abs_rel = paddle.distributed.all_reduce( + abs_rel, op=paddle.distributed.ReduceOp.SUM) / world_size + sq_rel = paddle.distributed.all_reduce( + sq_rel, op=paddle.distributed.ReduceOp.SUM) / world_size + rmse = paddle.distributed.all_reduce( + rmse, op=paddle.distributed.ReduceOp.SUM) / world_size + rmse_log = paddle.distributed.all_reduce( + rmse_log, op=paddle.distributed.ReduceOp.SUM) / world_size + a1 = paddle.distributed.all_reduce( + a1, op=paddle.distributed.ReduceOp.SUM) / world_size + a2 = paddle.distributed.all_reduce( + a2, op=paddle.distributed.ReduceOp.SUM) / world_size + a3 = paddle.distributed.all_reduce( + a3, op=paddle.distributed.ReduceOp.SUM) / world_size + return abs_rel.item(), sq_rel.item(), rmse.item(), rmse_log.item( + ), a1.item(), a2.item(), a3.item() + + return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 + + def compute_errors(self, gt, pred): + """Computation of error metrics between predicted and ground truth depths + """ + thresh = np.maximum((gt / pred), (pred / gt)) + a1 = (thresh < 1.25).mean() + a2 = (thresh < 1.25**2).mean() + a3 = (thresh < 1.25**3).mean() + + rmse = (gt - pred)**2 + rmse = np.sqrt(rmse.mean()) + + rmse_log = (np.log(gt) - np.log(pred))**2 + rmse_log = np.sqrt(rmse_log.mean()) + + abs_rel = np.mean(np.abs(gt - pred) / gt) + + sq_rel = np.mean(((gt - pred)**2) / gt) + + return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/agcn2s_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/agcn2s_head.py 
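Note: the error metrics computed by AddsHead.compute_errors above are the standard monocular-depth metrics (abs_rel, sq_rel, rmse, rmse_log and the delta < 1.25^k accuracies). A minimal NumPy-only sketch of that math, using hypothetical `gt`/`pred` arrays of matched, strictly positive depths clipped to [MIN_DEPTH, MAX_DEPTH] as the head does; this is an illustration, not the repo's code path:

import numpy as np

def depth_errors(gt, pred):
    # thresh-based accuracies: fraction of pixels within 1.25^k of the ground truth
    thresh = np.maximum(gt / pred, pred / gt)
    a1 = (thresh < 1.25).mean()
    a2 = (thresh < 1.25 ** 2).mean()
    a3 = (thresh < 1.25 ** 3).mean()
    rmse = np.sqrt(((gt - pred) ** 2).mean())
    rmse_log = np.sqrt(((np.log(gt) - np.log(pred)) ** 2).mean())
    abs_rel = np.mean(np.abs(gt - pred) / gt)
    sq_rel = np.mean(((gt - pred) ** 2) / gt)
    return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3

# hypothetical values, already median-scaled and clipped like get_metrics does
gt = np.array([2.0, 5.0, 10.0])
pred = np.clip(np.array([2.2, 4.5, 11.0]), 1e-3, 80.0)
print(depth_errors(gt, pred))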
new file mode 100644 index 0000000..92cb5e4 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/agcn2s_head.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import paddle.nn as nn + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +@HEADS.register() +class AGCN2sHead(BaseHead): + """ + Head for AGCN2s model. + Args: + in_channels: int, input feature channels. Default: 64. + num_classes: int, output the number of classes. + M: int, number of people. + drop_out: float, dropout ratio of layer. Default: 0. + """ + def __init__(self, in_channels=64, num_classes=10, M=2, **kwargs): + super().__init__(num_classes, in_channels, **kwargs) + self.in_channels = in_channels + self.M = M + weight_attr = paddle.ParamAttr( + name="linear_weight", + initializer=paddle.nn.initializer.Normal(mean=0.0, + std=math.sqrt( + 2. / num_classes))) + + self.fc = nn.Linear(self.in_channels * 4, + self.num_classes, + weight_attr=weight_attr) + + def forward(self, x): + """Define how the head is going to run. + """ + assert x.shape[ + 0] % self.M == 0, f'The first dimension of the output must be an integer multiple of the number of people M, but recieved shape[0]={x.shape[0]}, M={self.M}' + # N*M,C,T,V + N = x.shape[0] // self.M + c_new = x.shape[1] + x = x.reshape([N, self.M, c_new, -1]) + x = x.mean(3).mean(1) + + return self.fc(x) diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/asrf_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/asrf_head.py new file mode 100644 index 0000000..c3aab77 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/asrf_head.py @@ -0,0 +1,212 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
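Note: to make the reshape in AGCN2sHead.forward above concrete: the backbone emits one feature map per person, so the leading axis is N*M; the head averages over the flattened spatio-temporal axis and over the M persons before the final linear layer. A NumPy sketch under those assumptions (all shapes and the 10-class weight matrix are illustrative only):

import numpy as np

N, M, C, T, V = 2, 2, 256, 16, 25        # batch, persons, channels (in_channels*4), frames, joints
x = np.random.rand(N * M, C, T, V)       # stand-in for the backbone output, first dim is N*M

x = x.reshape(N, M, C, -1)               # [N, M, C, T*V]
x = x.mean(axis=3).mean(axis=1)          # pool joints/time, then persons -> [N, C]

W = np.random.rand(C, 10)                # stand-in for nn.Linear(C, num_classes)
logits = x @ W                           # [N, num_classes]
print(logits.shape)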
+ +# https://github.com/yiskw713/asrf/libs/models/tcn.py +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + +from paddle import ParamAttr + +from ..backbones.ms_tcn import SingleStageModel + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ +from ..framework.segmenters.utils import init_bias, KaimingUniform_like_torch + + +@HEADS.register() +class ASRFHead(BaseHead): + + def __init__(self, + num_classes, + num_features, + num_stages, + num_layers, + num_stages_asb=None, + num_stages_brb=None): + super().__init__(num_classes=num_classes, in_channels=num_features) + if not isinstance(num_stages_asb, int): + num_stages_asb = num_stages + + if not isinstance(num_stages_brb, int): + num_stages_brb = num_stages + + self.num_layers = num_layers + self.num_stages_asb = num_stages_asb + self.num_stages_brb = num_stages_brb + self.num_features = num_features + + # cls score + self.overlap = 0.5 + + self.conv_cls = nn.Conv1D(self.num_features, self.num_classes, 1) + self.conv_boundary = nn.Conv1D(self.num_features, 1, 1) + + # action segmentation branch + asb = [ + SingleStageModel(self.num_layers, self.num_features, + self.num_classes, self.num_classes) + for _ in range(self.num_stages_asb - 1) + ] + + # boundary regression branch + brb = [ + SingleStageModel(self.num_layers, self.num_features, 1, 1) + for _ in range(self.num_stages_brb - 1) + ] + self.brb = nn.LayerList(brb) + self.asb = nn.LayerList(asb) + + self.activation_asb = nn.Softmax(axis=1) + self.activation_brb = nn.Sigmoid() + + def init_weights(self): + """ + initialize model layers' weight + """ + # init weight + for layer in self.sublayers(): + if isinstance(layer, nn.Conv1D): + layer.weight.set_value( + KaimingUniform_like_torch(layer.weight).astype('float32')) + if layer.bias is not None: + layer.bias.set_value( + init_bias(layer.weight, layer.bias).astype('float32')) + + def forward(self, x): + """ + ASRF head + """ + out_cls = self.conv_cls(x) + out_boundary = self.conv_boundary(x) + + outputs_cls = [out_cls] + outputs_boundary = [out_boundary] + + for as_stage in self.asb: + out_cls = as_stage(self.activation_asb(out_cls)) + outputs_cls.append(out_cls) + + for br_stage in self.brb: + out_boundary = br_stage(self.activation_brb(out_boundary)) + outputs_boundary.append(out_boundary) + + return outputs_cls, outputs_boundary + + def get_F1_score(self, predicted, groundTruth): + recog_content = list(predicted.numpy()) + gt_content = list(groundTruth[0].numpy()) + + # cls score + correct = 0 + total = 0 + edit = 0 + + for i in range(len(gt_content)): + total += 1 + + if gt_content[i] == recog_content[i]: + correct += 1 + + edit_num = self.edit_score(recog_content, gt_content) + edit += edit_num + + tp, fp, fn = self.f_score(recog_content, gt_content, self.overlap) + + # cls metric + + precision = tp / float(tp + fp) + recall = tp / float(fp + fn) + + if precision + recall > 0.0: + f1 = 2.0 * (precision * recall) / (precision + recall) + else: + f1 = 0.0 + f1 = np.nan_to_num(f1) + return f1 + + def get_labels_start_end_time(self, frame_wise_labels): + labels = [] + starts = [] + ends = [] + last_label = frame_wise_labels[0] + labels.append(frame_wise_labels[0]) + starts.append(0) + for i in range(len(frame_wise_labels)): + if frame_wise_labels[i] != last_label: + labels.append(frame_wise_labels[i]) + starts.append(i) + ends.append(i) + last_label = frame_wise_labels[i] + ends.append(i + 1) + return labels, starts, ends + + def levenstein(self, p, 
y, norm=False): + m_row = len(p) + n_col = len(y) + D = np.zeros([m_row + 1, n_col + 1], float) + for i in range(m_row + 1): + D[i, 0] = i + for i in range(n_col + 1): + D[0, i] = i + + for j in range(1, n_col + 1): + for i in range(1, m_row + 1): + if y[j - 1] == p[i - 1]: + D[i, j] = D[i - 1, j - 1] + else: + D[i, j] = min(D[i - 1, j] + 1, D[i, j - 1] + 1, + D[i - 1, j - 1] + 1) + + if norm: + score = (1 - D[-1, -1] / max(m_row, n_col)) * 100 + else: + score = D[-1, -1] + + return score + + def edit_score(self, recognized, ground_truth, norm=True): + P, _, _ = self.get_labels_start_end_time(recognized) + Y, _, _ = self.get_labels_start_end_time(ground_truth) + return self.levenstein(P, Y, norm) + + def f_score(self, recognized, ground_truth, overlap): + p_label, p_start, p_end = self.get_labels_start_end_time(recognized) + y_label, y_start, y_end = self.get_labels_start_end_time(ground_truth) + + tp = 0 + fp = 0 + + hits = np.zeros(len(y_label)) + + for j in range(len(p_label)): + intersection = np.minimum(p_end[j], y_end) - np.maximum( + p_start[j], y_start) + union = np.maximum(p_end[j], y_end) - np.minimum( + p_start[j], y_start) + IoU = (1.0 * intersection / union) * ( + [p_label[j] == y_label[x] for x in range(len(y_label))]) + # Get the best scoring segment + idx = np.array(IoU).argmax() + + if IoU[idx] >= overlap and not hits[idx]: + tp += 1 + hits[idx] = 1 + else: + fp += 1 + fn = len(y_label) - sum(hits) + return float(tp), float(fp), float(fn) diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/attention_lstm_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/attention_lstm_head.py new file mode 100644 index 0000000..24c31ad --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/attention_lstm_head.py @@ -0,0 +1,288 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import ParamAttr +from paddle.nn.initializer import Normal +from paddle.regularizer import L2Decay +import paddle.nn.functional as F + +from ...metrics.youtube8m import eval_util as youtube8m_metrics +from ..registry import HEADS +from ..weight_init import weight_init_ +from .base import BaseHead + + +@HEADS.register() +class AttentionLstmHead(BaseHead): + """AttentionLstmHead.
+ Args: TODO + """ + def __init__(self, + num_classes=3862, + feature_num=2, + feature_dims=[1024, 128], + embedding_size=512, + lstm_size=1024, + in_channels=2048, + loss_cfg=dict(name='CrossEntropyLoss')): + super(AttentionLstmHead, self).__init__(num_classes, in_channels, + loss_cfg) + self.num_classes = num_classes + self.feature_dims = feature_dims + self.embedding_size = embedding_size + self.lstm_size = lstm_size + self.feature_num = len(self.feature_dims) + for i in range(self.feature_num): # 0:rgb, 1:audio + fc_feature = paddle.nn.Linear(in_features=self.feature_dims[i], + out_features=self.embedding_size) + self.add_sublayer("fc_feature{}".format(i), fc_feature) + + bi_lstm = paddle.nn.LSTM(input_size=self.embedding_size, + hidden_size=self.lstm_size, + direction="bidirectional") + self.add_sublayer("bi_lstm{}".format(i), bi_lstm) + + drop_rate = 0.5 + self.dropout = paddle.nn.Dropout(drop_rate) + + att_fc = paddle.nn.Linear(in_features=self.lstm_size * 2, + out_features=1) + self.add_sublayer("att_fc{}".format(i), att_fc) + self.softmax = paddle.nn.Softmax() + + self.fc_out1 = paddle.nn.Linear(in_features=self.lstm_size * 4, + out_features=8192, + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), + initializer=Normal())) + self.relu = paddle.nn.ReLU() + self.fc_out2 = paddle.nn.Linear(in_features=8192, + out_features=4096, + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), + initializer=Normal())) + self.fc_logit = paddle.nn.Linear(in_features=4096, + out_features=self.num_classes, + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), + initializer=Normal())) + self.sigmoid = paddle.nn.Sigmoid() + + def init_weights(self): + pass + + def forward(self, inputs): + # inputs = [(rgb_data, rgb_len, rgb_mask), (audio_data, audio_len, audio_mask)] + # deal with features with different length + # 1. padding to same lenght, make a tensor + # 2. make a mask tensor with the same shpae with 1 + # 3. compute output using mask tensor, s.t. output is nothing todo with padding + assert (len(inputs) == self.feature_num + ), "Input tensor does not contain {} features".format( + self.feature_num) + att_outs = [] + for i in range(len(inputs)): + # 1. fc + m = getattr(self, "fc_feature{}".format(i)) + output_fc = m(inputs[i][0]) + output_fc = paddle.tanh(output_fc) + + # 2. bi_lstm + m = getattr(self, "bi_lstm{}".format(i)) + lstm_out, _ = m(inputs=output_fc, sequence_length=inputs[i][1]) + + lstm_dropout = self.dropout(lstm_out) + + # 3. att_fc + m = getattr(self, "att_fc{}".format(i)) + lstm_weight = m(lstm_dropout) + + # 4. softmax replace start, for it's relevant to sum in time step + lstm_exp = paddle.exp(lstm_weight) + lstm_mask = paddle.mean(inputs[i][2], axis=2) + lstm_mask = paddle.unsqueeze(lstm_mask, axis=2) + lstm_exp_with_mask = paddle.multiply(x=lstm_exp, y=lstm_mask) + lstm_sum_with_mask = paddle.sum(lstm_exp_with_mask, axis=1) + exponent = -1 + lstm_denominator = paddle.pow(lstm_sum_with_mask, exponent) + lstm_denominator = paddle.unsqueeze(lstm_denominator, axis=2) + lstm_softmax = paddle.multiply(x=lstm_exp, y=lstm_denominator) + lstm_weight = lstm_softmax + # softmax replace end + + lstm_scale = paddle.multiply(x=lstm_dropout, y=lstm_weight) + + # 5. 
sequence_pool's replace start, for it's relevant to sum in time step + lstm_scale_with_mask = paddle.multiply(x=lstm_scale, y=lstm_mask) + fea_lens = inputs[i][1] + fea_len = int(fea_lens[0]) + lstm_pool = paddle.sum(lstm_scale_with_mask, axis=1) + # sequence_pool's replace end + att_outs.append(lstm_pool) + att_out = paddle.concat(att_outs, axis=1) + fc_out1 = self.fc_out1(att_out) + fc_out1_act = self.relu(fc_out1) + fc_out2 = self.fc_out2(fc_out1_act) + fc_out2_act = paddle.tanh(fc_out2) + fc_logit = self.fc_logit(fc_out2_act) + output = self.sigmoid(fc_logit) + return fc_logit, output + + def loss(self, lstm_logit, labels, **kwargs): + labels.stop_gradient = True + losses = dict() + bce_logit_loss = paddle.nn.BCEWithLogitsLoss(reduction='sum') + sum_cost = bce_logit_loss(lstm_logit, labels) + return sum_cost + + def metric(self, lstm_output, labels): + pred = lstm_output.numpy() + label = labels.numpy() + hit_at_one = youtube8m_metrics.calculate_hit_at_one(pred, label) + perr = youtube8m_metrics.calculate_precision_at_equal_recall_rate( + pred, label) + gap = youtube8m_metrics.calculate_gap(pred, label) + return hit_at_one, perr, gap + + +@HEADS.register() +class ActionAttentionLstmHead(BaseHead): + """AttentionLstmHead for FootballAction + Args: TODO + """ + def __init__(self, + num_classes=8, + feature_num=2, + feature_dims=[2048, 1024], + embedding_size=512, + lstm_size=1024, + in_channels=2048, + loss_cfg=dict(name='CrossEntropyLoss')): + super(ActionAttentionLstmHead, self).__init__(num_classes, in_channels, + loss_cfg) + self.num_classes = num_classes + self.feature_dims = feature_dims + self.embedding_size = embedding_size + self.lstm_size = lstm_size + self.feature_num = len(self.feature_dims) + for i in range(self.feature_num): # 0:rgb, 1:audio + bi_lstm = paddle.nn.LSTM(input_size=self.feature_dims[i], + hidden_size=self.feature_dims[i], + direction="bidirectional") + self.add_sublayer("bi_lstm{}".format(i), bi_lstm) + + drop_rate = 0.5 + self.dropout = paddle.nn.Dropout(drop_rate) + + att_fc = paddle.nn.Linear(in_features=self.feature_dims[i] * 2, + out_features=1) + self.add_sublayer("att_fc{}".format(i), att_fc) + self.softmax = paddle.nn.Softmax() + + self.fc1 = paddle.nn.Linear(in_features=2 * sum(self.feature_dims), + out_features=8192, + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), + initializer=Normal())) + self.bn1 = paddle.nn.BatchNorm(num_channels=8192) + self.dropout1 = paddle.nn.Dropout(0.5) + self.fc2 = paddle.nn.Linear(in_features=8192, + out_features=4096, + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), + initializer=Normal())) + self.bn2 = paddle.nn.BatchNorm(num_channels=4096) + self.dropout2 = paddle.nn.Dropout(0.5) + self.fc3 = paddle.nn.Linear( + in_features=4096, + out_features=self.num_classes, + ) + self.fc4 = paddle.nn.Linear( + in_features=4096, + out_features=1, + ) + + def init_weights(self): + pass + + def forward(self, inputs): + # inputs = [(rgb_data, rgb_len, rgb_mask), (audio_data, audio_len, audio_mask)] + # deal with features with different length + # 1. padding to same lenght, make a tensor + # 2. make a mask tensor with the same shpae with 1 + # 3. compute output using mask tensor, s.t. 
output is nothing todo with padding + assert (len(inputs) == self.feature_num + ), "Input tensor does not contain {} features".format( + self.feature_num) + att_outs = [] + for i in range(len(inputs)): + m = getattr(self, "bi_lstm{}".format(i)) + lstm_out, _ = m(inputs=inputs[i][0], sequence_length=inputs[i][1]) + + lstm_dropout = self.dropout(lstm_out) + + # 3. att_fc + m = getattr(self, "att_fc{}".format(i)) + lstm_weight = m(lstm_dropout) + + # 4. softmax replace start, for it's relevant to sum in time step + lstm_exp = paddle.exp(lstm_weight) + lstm_mask = paddle.mean(inputs[i][2], axis=2) + lstm_mask = paddle.unsqueeze(lstm_mask, axis=2) + lstm_exp_with_mask = paddle.multiply(x=lstm_exp, y=lstm_mask) + lstm_sum_with_mask = paddle.sum(lstm_exp_with_mask, axis=1) + exponent = -1 + lstm_denominator = paddle.pow(lstm_sum_with_mask, exponent) + lstm_denominator = paddle.unsqueeze(lstm_denominator, axis=2) + lstm_softmax = paddle.multiply(x=lstm_exp, y=lstm_denominator) + lstm_weight = lstm_softmax + # softmax replace end + + lstm_scale = paddle.multiply(x=lstm_dropout, y=lstm_weight) + + # 5. sequence_pool's replace start, for it's relevant to sum in time step + lstm_scale_with_mask = paddle.multiply(x=lstm_scale, y=lstm_mask) + # fea_lens = inputs[i][1] + # fea_len = int(fea_lens[0]) + lstm_pool = paddle.sum(lstm_scale_with_mask, axis=1) + # sequence_pool's replace end + att_outs.append(lstm_pool) + att_out = paddle.concat(att_outs, axis=1) + y = self.fc1(att_out) + y = self.bn1(y) + y = F.relu(y) + y = self.dropout1(y) + y = self.fc2(y) + y = self.bn2(y) + y = F.relu(y) + y = self.dropout2(y) + out1 = self.fc3(y) + out1 = F.softmax(out1) + out2 = self.fc4(y) + out2 = F.sigmoid(out2) + return out1, out2 + + def loss(self, logits, iou, labels, labels_iou, **kwargs): + alpha = 10 + softmax_loss = F.cross_entropy(logits, labels) + labels_iou = labels_iou.astype('float32') + mse_loss = paddle.sum(F.square_error_cost(iou, labels_iou), axis=-1) + sum_loss = softmax_loss + alpha * mse_loss + return sum_loss + + def metric(self, scores, labels): + top1 = paddle.metric.accuracy(input=scores, label=labels, k=1) + top5 = paddle.metric.accuracy(input=scores, label=labels, k=5) + return top1, top5 diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/base.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/base.py new file mode 100644 index 0000000..99a1408 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/base.py @@ -0,0 +1,178 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from abc import abstractmethod + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ..builder import build_loss +from paddlevideo.utils import get_logger, get_dist_info + +logger = get_logger("paddlevideo") + + +class BaseHead(nn.Layer): + """Base class for head part. + + All head should subclass it. 
+ All subclass should overwrite: + + - Methods: ```init_weights```, initializing weights. + - Methods: ```forward```, forward function. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channels in input feature. + loss_cfg (dict): Config for building loss. Default: dict(type='CrossEntropyLoss'). + ls_eps (float): label smoothing epsilon. Default: 0. . + + """ + def __init__( + self, + num_classes=None, + in_channels=None, + loss_cfg=dict( + name="CrossEntropyLoss" + ), #TODO(shipping): only pass a name or standard build cfg format. + #multi_class=False, NOTE(shipping): not supported now. + ls_eps=0.): + + super().__init__() + self.num_classes = num_classes + self.in_channels = in_channels + self.loss_func = build_loss(loss_cfg) + #self.multi_class = multi_class NOTE(shipping): not supported now + self.ls_eps = ls_eps + + @abstractmethod + def forward(self, x): + """Define how the head is going to run. + """ + raise NotImplemented + + def loss(self, scores, labels, valid_mode=False, if_top5=True, **kwargs): + """Calculate the loss accroding to the model output ```scores```, + and the target ```labels```. + + Args: + scores (paddle.Tensor): The output of the model. + labels (paddle.Tensor): The target output of the model. + + Returns: + losses (dict): A dict containing field 'loss'(mandatory) and 'top1_acc', 'top5_acc'(optional). + + """ + if len(labels) == 1: #commonly case + labels = labels[0] + losses = dict() + if self.ls_eps != 0. and not valid_mode: # label_smooth + loss = self.label_smooth_loss(scores, labels, **kwargs) + else: + loss = self.loss_func(scores, labels, **kwargs) + if if_top5: + top1, top5 = self.get_acc(scores, labels, valid_mode) + losses['top1'] = top1 + losses['top5'] = top5 + losses['loss'] = loss + else: + top1 = self.get_acc(scores, labels, valid_mode, if_top5) + losses['top1'] = top1 + losses['loss'] = loss + return losses + # MRI目前二分类无top5 + elif len(labels) == 3: # mix_up + labels_a, labels_b, lam = labels + lam = lam[0] # get lam value + losses = dict() + if self.ls_eps != 0: + loss_a = self.label_smooth_loss(scores, labels_a, **kwargs) + loss_b = self.label_smooth_loss(scores, labels_b, **kwargs) + else: + loss_a = self.loss_func(scores, labels_a, **kwargs) + loss_b = self.loss_func(scores, labels_b, **kwargs) + loss = lam * loss_a + (1 - lam) * loss_b + + if if_top5: + top1a, top5a = self.get_acc(scores, labels_a, valid_mode) + top1b, top5b = self.get_acc(scores, labels_b, valid_mode) + top1 = lam * top1a + (1 - lam) * top1b + top5 = lam * top5a + (1 - lam) * top5b + losses['top1'] = top1 + losses['top5'] = top5 + losses['loss'] = loss + + else: + top1a = self.get_acc(scores, labels_a, valid_mode, if_top5) + top1b = self.get_acc(scores, labels_b, valid_mode, if_top5) + top1 = lam * top1a + (1 - lam) * top1b + losses['top1'] = top1 + losses['loss'] = loss + + return losses + else: + raise NotImplemented + + def label_smooth_loss(self, scores, labels, **kwargs): + """ + Args: + scores (paddle.Tensor): [N, num_classes] + labels (paddle.Tensor): [N, ] + Returns: + paddle.Tensor: [1,] + """ + if paddle.is_compiled_with_custom_device('npu'): + """ + Designed for the lack of temporary operators of NPU, + main idea is to split smooth loss into uniform distribution loss + and hard label calculation + """ + hard_loss = (1.0 - self.ls_eps) * F.cross_entropy(scores, labels) + uniform_loss = (self.ls_eps / self.num_classes) * ( + -F.log_softmax(scores, -1).sum(-1).mean(0)) + loss = hard_loss + uniform_loss + else: 
+ labels = F.one_hot(labels, self.num_classes) + labels = F.label_smooth(labels, epsilon=self.ls_eps) + labels = paddle.squeeze(labels, axis=1) + loss = self.loss_func(scores, labels, soft_label=True, **kwargs) + return loss + + def get_acc(self, scores, labels, valid_mode, if_top5=True): + if if_top5: + top1 = paddle.metric.accuracy(input=scores, label=labels, k=1) + top5 = paddle.metric.accuracy(input=scores, label=labels, k=5) + _, world_size = get_dist_info() + #NOTE(shipping): deal with multi cards validate + if world_size > 1 and valid_mode: #reduce sum when valid + paddle.distributed.all_reduce( + top1, op=paddle.distributed.ReduceOp.SUM) + top1 = top1 / world_size + paddle.distributed.all_reduce( + top5, op=paddle.distributed.ReduceOp.SUM) + top5 = top5 / world_size + + return top1, top5 + else: + top1 = paddle.metric.accuracy(input=scores, label=labels, k=1) + _, world_size = get_dist_info() + #NOTE(shipping): deal with multi cards validate + if world_size > 1 and valid_mode: #reduce sum when valid + paddle.distributed.all_reduce( + top1, op=paddle.distributed.ReduceOp.SUM) + top1 = top1 / world_size + + return top1 diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/bbox_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/bbox_head.py new file mode 100644 index 0000000..688251e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/bbox_head.py @@ -0,0 +1,225 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +from .. import builder + +from ..registry import HEADS + +@HEADS.register() +class BBoxHeadAVA(nn.Layer): + """Simplest RoI head, with only two fc layers for classification and + regression respectively. 
""" + + def __init__( + self, + temporal_pool_type='avg', + spatial_pool_type='max', + in_channels=2048, + num_classes=81,# The first class is reserved, to classify bbox as pos / neg + dropout_ratio=0, + dropout_before_pool=True, + topk=(3, 5), + multilabel=True): + + super(BBoxHeadAVA, self).__init__() + assert temporal_pool_type in ['max', 'avg'] + assert spatial_pool_type in ['max', 'avg'] + self.temporal_pool_type = temporal_pool_type + self.spatial_pool_type = spatial_pool_type + + self.in_channels = in_channels + self.num_classes = num_classes + + self.dropout_ratio = dropout_ratio + self.dropout_before_pool = dropout_before_pool + + self.multilabel = multilabel + if topk is None: + self.topk = () + elif isinstance(topk, int): + self.topk = (topk, ) + elif isinstance(topk, tuple): + assert all([isinstance(k, int) for k in topk]) + self.topk = topk + else: + raise TypeError('topk should be int or tuple[int], ' + f'but get {type(topk)}') + # Class 0 is ignored when calculaing multilabel accuracy, + # so topk cannot be equal to num_classes + assert all([k < num_classes for k in self.topk]) + assert self.multilabel + + in_channels = self.in_channels + if self.temporal_pool_type == 'avg': + self.temporal_pool = nn.AdaptiveAvgPool3D((1, None, None)) + else: + self.temporal_pool = nn.AdaptiveMaxPool3D((1, None, None)) + if self.spatial_pool_type == 'avg': + self.spatial_pool = nn.AdaptiveAvgPool3D((None, 1, 1)) + else: + self.spatial_pool = nn.AdaptiveMaxPool3D((None, 1, 1)) + + if dropout_ratio > 0: + self.dropout = nn.Dropout(dropout_ratio) + + weight_attr = paddle.framework.ParamAttr(name="weight", + initializer=paddle.nn.initializer.Normal(mean=0.0, std=0.01)) + bias_attr = paddle.ParamAttr(name="bias", + initializer=paddle.nn.initializer.Constant(value=0.0)) + + self.fc_cls = nn.Linear(in_channels, num_classes, weight_attr=weight_attr, bias_attr=bias_attr) + + self.debug_imgs = None + + def forward(self, x,rois, rois_num): + roi = paddle.concat(rois) + roi_x1 = paddle.index_select(roi, index=paddle.to_tensor(0), axis=1) + roi_x2 = paddle.index_select(roi, index=paddle.to_tensor(2), axis=1) + roi_w = roi_x2 - roi_x1 + roi_y1 = paddle.index_select(roi, index=paddle.to_tensor(1), axis=1) + roi_y2 = paddle.index_select(roi, index=paddle.to_tensor(3), axis=1) + roi_h = roi_y2 - roi_y1 + roi_area = paddle.multiply(roi_w, roi_h) + A = roi_area + A1 = paddle.full(A.shape, 1, dtype='int32') + A2 = paddle.where(A == 0, paddle.zeros_like(A1), A1) + AE = paddle.expand(A2, [A.shape[0], x.shape[1]]) + rois_num = paddle.to_tensor(rois_num, dtype='int32') + if self.dropout_before_pool and self.dropout_ratio > 0 : + x = self.dropout(x) + x = self.temporal_pool(x) + x = self.spatial_pool(x) + if not self.dropout_before_pool and self.dropout_ratio > 0 : + x = self.dropout(x) + x = paddle.reshape(x, [x.shape[0], -1]) + x = paddle.multiply(x, paddle.cast(AE,"float32")) + cls_score = self.fc_cls(x) + # We do not predict bbox, so return None + return cls_score, None + + def get_targets(self, sampling_results, gt_bboxes, gt_labels, pos_weight): + pos_proposals = [res.pos_bboxes for res in sampling_results] + neg_proposals = [res.neg_bboxes for res in sampling_results] + pos_gt_labels = [res.pos_gt_labels for res in sampling_results] + cls_reg_targets = self.bbox_target(pos_proposals, neg_proposals, + pos_gt_labels, pos_weight) + return cls_reg_targets + + def bbox_target(self, pos_bboxes_list, neg_bboxes_list, gt_labels, pos_weight): + """Generate classification targets for bboxes. 
""" + labels, label_weights = [], [] + pos_weight = 1.0 if pos_weight <= 0 else pos_weight + + assert len(pos_bboxes_list) == len(neg_bboxes_list) == len(gt_labels) + length = len(pos_bboxes_list) + + for i in range(length): + pos_bboxes = pos_bboxes_list[i] + neg_bboxes = neg_bboxes_list[i] + gt_label = gt_labels[i] + num_pos = pos_bboxes.shape[0] + if neg_bboxes is not None: + num_neg = neg_bboxes.shape[0] + else: + num_neg = 0 + num_samples = num_pos + num_neg + neg_label = paddle.zeros([num_neg, gt_label.shape[1]]) + label = paddle.concat([gt_label,neg_label]) + labels.append(label) + + labels = paddle.concat(labels, 0) + return labels + + def recall_prec(self, pred_vec, target_vec): + correct = paddle.to_tensor(np.logical_and(pred_vec.numpy(), target_vec.numpy())) + correct = paddle.where(correct, + paddle.full(correct.shape,1,dtype='int32'), + paddle.full(correct.shape,0,dtype='int32')) + recall_correct = paddle.cast(paddle.sum(correct, axis=1), 'float32') + target_vec = paddle.where(target_vec, + paddle.full(target_vec.shape,1,dtype='int32'), + paddle.full(target_vec.shape,0,dtype='int32')) + recall_target = paddle.cast(paddle.sum(target_vec, axis=1),'float32') + recall = recall_correct / recall_target + pred_vec = paddle.where(pred_vec, + paddle.full(pred_vec.shape,1,dtype='int32'), + paddle.full(pred_vec.shape,0,dtype='int32')) + prec_target = paddle.cast(paddle.sum(pred_vec, axis=1) + 1e-6, 'float32') + prec = recall_correct / prec_target + recall_mean = paddle.mean(recall) + prec_mean = paddle.mean(prec) + return recall_mean, prec_mean + + def multilabel_accuracy(self, pred, target, thr=0.5): + pred = paddle.nn.functional.sigmoid(pred) + pred_vec = pred > thr + target_vec = target > 0.5 + recall_thr, prec_thr = self.recall_prec(pred_vec, target_vec) + recalls, precs = [], [] + for k in self.topk: + _, pred_label = paddle.topk(pred, k, 1, True, True) + pred_vec = paddle.full(pred.shape,0,dtype='bool') + num_sample = pred.shape[0] + for i in range(num_sample): + pred_vec[i, pred_label[i].numpy()] = 1 + recall_k, prec_k = self.recall_prec(pred_vec, target_vec) + recalls.append(recall_k) + precs.append(prec_k) + return recall_thr, prec_thr, recalls, precs + + def loss(self, + cls_score, + labels): + losses = dict() + if cls_score is not None: + # Only use the cls_score + labels = labels[:, 1:] + pos_inds_bool = paddle.sum(labels, axis=-1) > 0 + pos_inds = paddle.where(paddle.sum(labels, axis=-1) > 0, + paddle.full([labels.shape[0]],1,dtype='int32'), + paddle.full([labels.shape[0]],0,dtype='int32')) + pos_inds = paddle.nonzero(pos_inds, as_tuple=False) + cls_score = paddle.index_select(cls_score, pos_inds, axis=0) + cls_score = cls_score[:, 1:] + labels = paddle.index_select(labels, pos_inds, axis=0) + bce_loss = F.binary_cross_entropy_with_logits + loss = bce_loss(cls_score, labels, reduction='none') + losses['loss'] = paddle.mean(loss) + recall_thr, prec_thr, recall_k, prec_k = self.multilabel_accuracy( + cls_score, labels, thr=0.5) + losses['recall@thr=0.5'] = recall_thr + losses['prec@thr=0.5'] = prec_thr + for i, k in enumerate(self.topk): + losses[f'recall@top{k}'] = recall_k[i] + losses[f'prec@top{k}'] = prec_k[i] + return losses + + def get_det_bboxes(self, + rois, + cls_score, + img_shape, + flip=False, + crop_quadruple=None, + cfg=None): + if isinstance(cls_score, list): + cls_score = sum(cls_score) / float(len(cls_score)) + assert self.multilabel + m = paddle.nn.Sigmoid() + scores = m(cls_score) + bboxes = rois + return bboxes, scores diff --git 
a/Bank_second_part/detect_process/paddlevideo/modeling/heads/cfbi_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/cfbi_head.py new file mode 100644 index 0000000..f7cbd91 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/cfbi_head.py @@ -0,0 +1,448 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +class IA_gate(nn.Layer): + def __init__(self, in_dim, out_dim): + super(IA_gate, self).__init__() + self.IA = nn.Linear(in_dim, out_dim) + + def forward(self, x, IA_head): + a = self.IA(IA_head) + a = 1. + paddle.tanh(a) + a = paddle.unsqueeze(paddle.unsqueeze(a, axis=-1), axis=-1) + x = a * x + return x + + +class GCT(nn.Layer): + def __init__(self, num_channels, epsilon=1e-5, mode='l2', after_relu=False): + super(GCT, self).__init__() + x1 = paddle.zeros([1, num_channels, 1, 1]) + x2 = paddle.ones([1, num_channels, 1, 1]) + self.alpha = paddle.create_parameter( + shape=x2.shape, + dtype=x2.dtype, + default_initializer=nn.initializer.Assign(x2)) + self.alpha.stop_gradient = False + self.gamma = paddle.create_parameter( + shape=x1.shape, + dtype=x1.dtype, + default_initializer=nn.initializer.Assign(x1)) + self.gamma.stop_gradient = False + self.beta = paddle.create_parameter( + shape=x1.shape, + dtype=x1.dtype, + default_initializer=nn.initializer.Assign(x1)) + self.beta.stop_gradient = False + + self.epsilon = epsilon + self.mode = mode + self.after_relu = after_relu + + def forward(self, x): + + if self.mode == 'l2': + embedding = paddle.pow( + paddle.sum(paddle.pow(x, 2), axis=[2, 3], keepdim=True) + + self.epsilon, 0.5) * self.alpha + norm = self.gamma / paddle.pow( + (paddle.mean(paddle.pow(embedding, 2), axis=1, keepdim=True) + + self.epsilon), 0.5) + elif self.mode == 'l1': + if not self.after_relu: + _x = paddle.abs(x) + else: + _x = x + embedding = paddle.sum(_x, axis=(2, 3), keepdim=True) * self.alpha + norm = self.gamma / (paddle.mean( + paddle.abs(embedding), axis=1, keepdim=True) + self.epsilon) + else: + print('Unknown mode!') + exit() + + gate = 1. 
+ paddle.tanh(embedding * norm + self.beta) + + return x * gate + + +class Bottleneck(nn.Layer): + def __init__(self, inplanes, outplanes, stride=1, dilation=1): + super(Bottleneck, self).__init__() + expansion = 4 + planes = int(outplanes / expansion) + + self.GCT1 = GCT(inplanes) + self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False) + self.bn1 = nn.GroupNorm(num_groups=32, num_channels=planes) + + self.conv2 = nn.Conv2D(planes, + planes, + kernel_size=3, + stride=stride, + dilation=dilation, + padding=dilation, + bias_attr=False) + self.bn2 = nn.GroupNorm(num_groups=32, num_channels=planes) + + self.conv3 = nn.Conv2D(planes, + planes * expansion, + kernel_size=1, + bias_attr=False) + self.bn3 = nn.GroupNorm(num_groups=32, num_channels=planes * expansion) + self.relu = nn.ReLU() + if stride != 1 or inplanes != planes * expansion: + downsample = nn.Sequential( + nn.Conv2D(inplanes, + planes * expansion, + kernel_size=1, + stride=stride, + bias_attr=False), + nn.GroupNorm(num_groups=32, num_channels=planes * expansion), + ) + else: + downsample = None + self.downsample = downsample + + self.stride = stride + self.dilation = dilation + + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + nn.initializer.KaimingNormal() + + def forward(self, x): + residual = x + + out = self.GCT1(x) + out = self.conv1(out) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class _ASPPModule(nn.Layer): + def __init__(self, inplanes, planes, kernel_size, padding, dilation): + super(_ASPPModule, self).__init__() + self.GCT = GCT(inplanes) + self.atrous_conv = nn.Conv2D(inplanes, + planes, + kernel_size=kernel_size, + stride=1, + padding=padding, + dilation=dilation, + bias_attr=False) + self.bn = nn.GroupNorm(num_groups=int(planes / 4), num_channels=planes) + self.relu = nn.ReLU() + + self._init_weight() + + def forward(self, x): + x = self.GCT(x) + x = self.atrous_conv(x) + x = self.bn(x) + + return self.relu(x) + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + nn.initializer.KaimingNormal() + elif isinstance(m, nn.GroupNorm): + m.weight.data = nn.initializer.Constant(1) + m.bias.data = nn.initializer.Constant(0) + + +class ASPP(nn.Layer): + def __init__(self): + super(ASPP, self).__init__() + + inplanes = 512 + dilations = [1, 6, 12, 18] + + self.aspp1 = _ASPPModule(inplanes, + 128, + 1, + padding=0, + dilation=dilations[0]) + self.aspp2 = _ASPPModule(inplanes, + 128, + 3, + padding=dilations[1], + dilation=dilations[1]) + self.aspp3 = _ASPPModule(inplanes, + 128, + 3, + padding=dilations[2], + dilation=dilations[2]) + self.aspp4 = _ASPPModule(inplanes, + 128, + 3, + padding=dilations[3], + dilation=dilations[3]) + + self.global_avg_pool = nn.Sequential( + nn.AdaptiveAvgPool2D((1, 1)), + nn.Conv2D(inplanes, 128, 1, stride=1, bias_attr=False), nn.ReLU()) + + self.GCT = GCT(640) + self.conv1 = nn.Conv2D(640, 256, 1, bias_attr=False) + self.bn1 = nn.GroupNorm(num_groups=32, num_channels=256) + self.relu = nn.ReLU() + self._init_weight() + + def forward(self, x): + x1 = self.aspp1(x) + x2 = self.aspp2(x) + x3 = self.aspp3(x) + x4 = self.aspp4(x) + x5 = self.global_avg_pool(x) + x5 = F.interpolate(x5, + size=x4.shape[2:], + mode='bilinear', + align_corners=True) + x = paddle.concat([x1, x2, x3, x4, x5], 
axis=1) + + x = self.GCT(x) + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + return x + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + nn.initializer.KaimingNormal() + elif isinstance(m, nn.GroupNorm): + m.weight.data = nn.initializer.Constant(1) + m.bias.data = nn.initializer.Constant(0) + + +@HEADS.register() +class CollaborativeEnsemblerMS(nn.Layer): + def __init__( + self, + model_semantic_embedding_dim=256, + model_multi_local_distance=[[4, 8, 12, 16, 20, 24], + [2, 4, 6, 8, 10, 12], [2, 4, 6, 8, 10]], + model_head_embedding_dim=256, + model_refine_channels=64, + model_low_level_inplanes=256, + ): + super(CollaborativeEnsemblerMS, self).__init__() + in_dim_4x = model_semantic_embedding_dim * 3 + 3 + 2 * len( + model_multi_local_distance[0]) + in_dim_8x = model_semantic_embedding_dim * 3 + 3 + 2 * len( + model_multi_local_distance[1]) + in_dim_16x = model_semantic_embedding_dim * 3 + 3 + 2 * len( + model_multi_local_distance[2]) + attention_dim = model_semantic_embedding_dim * 4 + embed_dim = model_head_embedding_dim + refine_dim = model_refine_channels + low_level_dim = model_low_level_inplanes + + IA_in_dim = attention_dim + + self.relu = nn.ReLU() + + # stage 1 + + self.S1_IA1 = IA_gate(IA_in_dim, in_dim_4x) + self.S1_layer1 = Bottleneck(in_dim_4x, embed_dim) + + self.S1_IA2 = IA_gate(IA_in_dim, embed_dim) + self.S1_layer2 = Bottleneck(embed_dim, embed_dim, 1, 2) + + # stage2 + self.S2_IA1 = IA_gate(IA_in_dim, embed_dim) + self.S2_layer1 = Bottleneck(embed_dim, embed_dim * 2, 2) + + self.S2_IA2 = IA_gate(IA_in_dim, embed_dim * 2 + in_dim_8x) + self.S2_layer2 = Bottleneck(embed_dim * 2 + in_dim_8x, embed_dim * 2, 1, + 2) + + self.S2_IA3 = IA_gate(IA_in_dim, embed_dim * 2) + self.S2_layer3 = Bottleneck(embed_dim * 2, embed_dim * 2, 1, 4) + + # stage3 + self.S3_IA1 = IA_gate(IA_in_dim, embed_dim * 2) + self.S3_layer1 = Bottleneck(embed_dim * 2, embed_dim * 2, 2) + + self.S3_IA2 = IA_gate(IA_in_dim, embed_dim * 2 + in_dim_16x) + self.S3_layer2 = Bottleneck(embed_dim * 2 + in_dim_16x, embed_dim * 2, + 1, 2) + + self.S3_IA3 = IA_gate(IA_in_dim, embed_dim * 2) + self.S3_layer3 = Bottleneck(embed_dim * 2, embed_dim * 2, 1, 4) + + self.ASPP_IA = IA_gate(IA_in_dim, embed_dim * 2) + self.ASPP = ASPP() + + # Decoder + self.GCT_sc = GCT(low_level_dim + embed_dim) + self.conv_sc = nn.Conv2D(low_level_dim + embed_dim, + refine_dim, + 1, + bias_attr=False) + self.bn_sc = nn.GroupNorm(num_groups=int(refine_dim / 4), + num_channels=refine_dim) + self.relu = nn.ReLU() + + self.IA10 = IA_gate(IA_in_dim, embed_dim + refine_dim) + self.conv1 = nn.Conv2D(embed_dim + refine_dim, + int(embed_dim / 2), + kernel_size=3, + padding=1, + bias_attr=False) + self.bn1 = nn.GroupNorm(num_groups=32, num_channels=int(embed_dim / 2)) + + self.IA11 = IA_gate(IA_in_dim, int(embed_dim / 2)) + self.conv2 = nn.Conv2D(int(embed_dim / 2), + int(embed_dim / 2), + kernel_size=3, + padding=1, + bias_attr=False) + self.bn2 = nn.GroupNorm(num_groups=32, num_channels=int(embed_dim / 2)) + + # Output + self.IA_final_fg = nn.Linear(IA_in_dim, int(embed_dim / 2) + 1) + self.IA_final_bg = nn.Linear(IA_in_dim, int(embed_dim / 2) + 1) + + self.conv_sc.weight.data = nn.initializer.KaimingNormal() + self.conv1.weight.data = nn.initializer.KaimingNormal() + self.conv2.weight.data = nn.initializer.KaimingNormal() + + def forward(self, all_x, all_IA_head=None, low_level_feat=None): + x_4x, x_8x, x_16x = all_x + IA_head = all_IA_head[0] + + # stage 1 + x = self.S1_IA1(x_4x, IA_head) + x = 
self.S1_layer1(x) + + x = self.S1_IA2(x, IA_head) + x = self.S1_layer2(x) + + low_level_feat = paddle.concat( + [paddle.expand(low_level_feat, [x.shape[0], -1, -1, -1]), x], + axis=1) + + # stage 2 + x = self.S2_IA1(x, IA_head) + x = self.S2_layer1(x) + + x = paddle.concat([x, x_8x], axis=1) + x = self.S2_IA2(x, IA_head) + x = self.S2_layer2(x) + + x = self.S2_IA3(x, IA_head) + x = self.S2_layer3(x) + + # stage 3 + x = self.S3_IA1(x, IA_head) + x = self.S3_layer1(x) + + x = paddle.concat([x, x_16x], axis=1) + x = self.S3_IA2(x, IA_head) + x = self.S3_layer2(x) + + x = self.S3_IA3(x, IA_head) + x = self.S3_layer3(x) + + # ASPP + Decoder + x = self.ASPP_IA(x, IA_head) + x = self.ASPP(x) + + x = self.decoder(x, low_level_feat, IA_head) + + fg_logit = self.IA_logit(x, IA_head, self.IA_final_fg) + bg_logit = self.IA_logit(x, IA_head, self.IA_final_bg) + + pred = self.augment_background_logit(fg_logit, bg_logit) + + return pred + + def IA_logit(self, x, IA_head, IA_final): + n, c, h, w = x.shape + x = paddle.reshape(x, [1, n * c, h, w]) + IA_output = IA_final(IA_head) + IA_weight = IA_output[:, :c] + IA_bias = IA_output[:, -1] + IA_weight = paddle.reshape(IA_weight, [n, c, 1, 1]) + + IA_bias = paddle.reshape(IA_bias, [-1]) + logit = paddle.reshape( + F.conv2d(x, weight=IA_weight, bias=IA_bias, groups=n), [n, 1, h, w]) + return logit + + def decoder(self, x, low_level_feat, IA_head): + x = F.interpolate(x, + size=low_level_feat.shape[2:], + mode='bicubic', + align_corners=True) + + low_level_feat = self.GCT_sc(low_level_feat) + low_level_feat = self.conv_sc(low_level_feat) + low_level_feat = self.bn_sc(low_level_feat) + low_level_feat = self.relu(low_level_feat) + + x = paddle.concat([x, low_level_feat], axis=1) + x = self.IA10(x, IA_head) + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.IA11(x, IA_head) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + + return x + + def augment_background_logit(self, fg_logit, bg_logit): + # We augment the logit of absolute background by using the relative background logit of all the + # foreground objects. + obj_num = fg_logit.shape[0] + pred = fg_logit + if obj_num > 1: + bg_logit = bg_logit[1:obj_num, :, :, :] + aug_bg_logit = paddle.min(bg_logit, axis=0, keepdim=True) + pad = paddle.expand(paddle.zeros(aug_bg_logit.shape), + [obj_num - 1, -1, -1, -1]) + aug_bg_logit = paddle.concat([aug_bg_logit, pad], axis=0) + pred = pred + aug_bg_logit + pred = paddle.transpose(pred, [1, 0, 2, 3]) + return pred diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/ctrgcn_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/ctrgcn_head.py new file mode 100644 index 0000000..c551d0d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/ctrgcn_head.py @@ -0,0 +1,65 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
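Note: the background augmentation at the end of CollaborativeEnsemblerMS is the least obvious step: the absolute-background logit (object 0) is strengthened by the most background-like relative logit among the other objects. A NumPy sketch of that idea with hypothetical shapes; it mirrors augment_background_logit above but is only an illustration:

import numpy as np

def augment_background_logit(fg_logit, bg_logit):
    # fg_logit, bg_logit: [obj_num, 1, H, W]; index 0 is the absolute background
    obj_num = fg_logit.shape[0]
    pred = fg_logit.copy()
    if obj_num > 1:
        aug = bg_logit[1:obj_num].min(axis=0, keepdims=True)    # most background-like object
        pad = np.zeros((obj_num - 1,) + aug.shape[1:])          # other objects stay unchanged
        pred = pred + np.concatenate([aug, pad], axis=0)
    return pred.transpose(1, 0, 2, 3)                           # [1, obj_num, H, W]

print(augment_background_logit(np.zeros((3, 1, 4, 4)), np.random.rand(3, 1, 4, 4)).shape)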
+ +import math +import paddle +import paddle.nn as nn + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +@HEADS.register() +class CTRGCNHead(BaseHead): + """ + Head for CTR-GCN model. + Args: + in_channels: int, input feature channels. Default: 64. + num_classes: int, output the number of classes. + drop_out: float, dropout ratio of layer. Default: 0. + """ + + def __init__(self, in_channels=64, num_classes=10, drop_out=0, **kwargs): + super().__init__(num_classes, in_channels, **kwargs) + self.in_channels = in_channels + self.drop_out = drop_out + + self.fc = nn.Linear(self.in_channels * 4, self.num_classes) + if drop_out: + self.drop_out = nn.Dropout(self.drop_out) + else: + self.drop_out = lambda x: x + + def init_weights(self): + """Initiate the parameters. + """ + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + weight_init_(layer.weight, + 'Normal', + mean=0.0, + std=math.sqrt(2. / self.num_classes)) + + def forward(self, output_patch): + """Define how the head is going to run. + """ + x, N, M = output_patch + # N*M,C,T,V + _, c_new, T, V = x.shape + x = paddle.reshape(x, shape=[N, M, c_new, T * V]) + x = x.mean(3).mean(1) + x = self.drop_out(x) + + return self.fc(x) diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/i3d_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/i3d_head.py new file mode 100644 index 0000000..269c818 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/i3d_head.py @@ -0,0 +1,95 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +from paddle import ParamAttr + +from ..registry import HEADS +from ..weight_init import weight_init_ +from .base import BaseHead + + +@HEADS.register() +class I3DHead(BaseHead): + """Classification head for I3D. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict): Config for building loss. + Default: dict(name='CrossEntropyLoss') + spatial_type (str): Pooling type in spatial dimension. Default: 'avg'. + drop_ratio (float): Probability of dropout layer. Default: 0.5. + std (float): Std value for Initiation. Default: 0.01. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. 
+ """ + def __init__(self, + num_classes, + in_channels, + loss_cfg=dict(name='CrossEntropyLoss'), + spatial_type='avg', + drop_ratio=0.5, + std=0.01, + **kwargs): + + super().__init__(num_classes, in_channels, loss_cfg, **kwargs) + + self.spatial_type = spatial_type + self.drop_ratio = drop_ratio + self.stdv = std + if self.drop_ratio != 0: + self.dropout = nn.Dropout(p=self.drop_ratio) + else: + self.dropout = None + self.fc = nn.Linear( + self.in_channels, + self.num_classes, + weight_attr=ParamAttr(learning_rate=10.0), + bias_attr=ParamAttr(learning_rate=10.0), + ) + + if self.spatial_type == 'avg': + # use `nn.AdaptiveAvgPool3d` to adaptively match the in_channels. + self.avg_pool = nn.AdaptiveAvgPool3D((1, 1, 1)) + else: + self.avg_pool = None + + def init_weights(self): + """Initiate the parameters from scratch.""" + weight_init_(self.fc, 'Normal', 'fc_0.w_0', 'fc_0.b_0', std=self.stdv) + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The classification scores for input samples. + """ + # [N, in_channels, 4, 7, 7] + if self.avg_pool is not None: + x = self.avg_pool(x) + # [N, in_channels, 1, 1, 1] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels, 1, 1, 1] + N = paddle.shape(x)[0] + x = x.reshape([N, -1]) + # [N, in_channels] + cls_score = self.fc(x) + # [N, num_classes] + return cls_score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/movinet_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/movinet_head.py new file mode 100644 index 0000000..924b014 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/movinet_head.py @@ -0,0 +1,15 @@ +import collections.abc + +container_abcs = collections.abc +from ..registry import HEADS +from .base import BaseHead +from ..builder import build_loss + + +@HEADS.register() +class MoViNetHead(BaseHead): + def __init__(self): + super().__init__() + + def forward(self, x, *args): + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/ms_tcn_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/ms_tcn_head.py new file mode 100644 index 0000000..e0f435f --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/ms_tcn_head.py @@ -0,0 +1,165 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + +from paddle import ParamAttr + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +@HEADS.register() +class MSTCNHead(BaseHead): + + def __init__(self, num_classes, in_channels): + super().__init__(num_classes, in_channels) + self.ce = nn.CrossEntropyLoss(ignore_index=-100) + self.mse = nn.MSELoss(reduction='none') + self.num_classes = num_classes + + # cls score + self.overlap = 0.5 + + def forward(self, x): + """MS-TCN no head + """ + return x + + def loss(self, output, video_gt): + """calculate loss + """ + output_transpose = paddle.transpose(output, [2, 0, 1]) + ce_x = paddle.reshape(output_transpose, + (output_transpose.shape[0] * + output_transpose.shape[1], self.num_classes)) + ce_y = video_gt[0, :] + ce_loss = self.ce(ce_x, ce_y) + loss = ce_loss + + mse = self.mse(F.log_softmax(output[:, :, 1:], axis=1), + F.log_softmax(output.detach()[:, :, :-1], axis=1)) + mse = paddle.clip(mse, min=0, max=16) + mse_loss = 0.15 * paddle.mean(mse) + loss += mse_loss + + return loss + + def get_F1_score(self, predicted, groundTruth): + recog_content = list(predicted.numpy()) + gt_content = list(groundTruth[0].numpy()) + + # cls score + correct = 0 + total = 0 + edit = 0 + + for i in range(len(gt_content)): + total += 1 + + if gt_content[i] == recog_content[i]: + correct += 1 + + edit_num = self.edit_score(recog_content, gt_content) + edit += edit_num + + tp, fp, fn = self.f_score(recog_content, gt_content, self.overlap) + + # cls metric + + precision = tp / float(tp + fp) + recall = tp / float(fp + fn) + + if precision + recall > 0.0: + f1 = 2.0 * (precision * recall) / (precision + recall) + else: + f1 = 0.0 + f1 = np.nan_to_num(f1) + return f1 + + def get_labels_start_end_time(self, frame_wise_labels): + labels = [] + starts = [] + ends = [] + last_label = frame_wise_labels[0] + labels.append(frame_wise_labels[0]) + starts.append(0) + for i in range(len(frame_wise_labels)): + if frame_wise_labels[i] != last_label: + labels.append(frame_wise_labels[i]) + starts.append(i) + ends.append(i) + last_label = frame_wise_labels[i] + ends.append(i + 1) + return labels, starts, ends + + def levenstein(self, p, y, norm=False): + m_row = len(p) + n_col = len(y) + D = np.zeros([m_row + 1, n_col + 1], np.float) + for i in range(m_row + 1): + D[i, 0] = i + for i in range(n_col + 1): + D[0, i] = i + + for j in range(1, n_col + 1): + for i in range(1, m_row + 1): + if y[j - 1] == p[i - 1]: + D[i, j] = D[i - 1, j - 1] + else: + D[i, j] = min(D[i - 1, j] + 1, D[i, j - 1] + 1, + D[i - 1, j - 1] + 1) + + if norm: + score = (1 - D[-1, -1] / max(m_row, n_col)) * 100 + else: + score = D[-1, -1] + + return score + + def edit_score(self, recognized, ground_truth, norm=True): + P, _, _ = self.get_labels_start_end_time(recognized) + Y, _, _ = self.get_labels_start_end_time(ground_truth) + return self.levenstein(P, Y, norm) + + def f_score(self, recognized, ground_truth, overlap): + p_label, p_start, p_end = self.get_labels_start_end_time(recognized) + y_label, y_start, y_end = self.get_labels_start_end_time(ground_truth) + + tp = 0 + fp = 0 + + hits = np.zeros(len(y_label)) + + for j in range(len(p_label)): + intersection = np.minimum(p_end[j], y_end) - np.maximum( + p_start[j], y_start) + union = np.maximum(p_end[j], y_end) - np.minimum( + p_start[j], y_start) + IoU = (1.0 * intersection / union) * ( + [p_label[j] == y_label[x] for x in range(len(y_label))]) + # Get the best 
scoring segment + idx = np.array(IoU).argmax() + + if IoU[idx] >= overlap and not hits[idx]: + tp += 1 + hits[idx] = 1 + else: + fp += 1 + fn = len(y_label) - sum(hits) + return float(tp), float(fp), float(fn) diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/pptimesformer_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/pptimesformer_head.py new file mode 100644 index 0000000..113bde8 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/pptimesformer_head.py @@ -0,0 +1,74 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.nn import Linear + +from ..registry import HEADS +from ..weight_init import trunc_normal_, weight_init_ +from .base import BaseHead +from paddle import ParamAttr +from paddle.regularizer import L2Decay + + +@HEADS.register() +class ppTimeSformerHead(BaseHead): + """TimeSformerHead Head. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + std(float): Std(Scale) value in normal initilizar. Default: 0.01. + kwargs (dict, optional): Any keyword argument to initialize. + + """ + def __init__(self, + num_classes, + in_channels, + loss_cfg=dict(name='CrossEntropyLoss'), + std=0.02, + **kwargs): + + super().__init__(num_classes, in_channels, loss_cfg, **kwargs) + self.std = std + self.fc = Linear(self.in_channels, + self.num_classes, + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + def init_weights(self): + """Initiate the FC layer parameters""" + + weight_init_(self.fc, + 'TruncatedNormal', + 'fc_0.w_0', + 'fc_0.b_0', + mean=0.0, + std=self.std) + # NOTE: Temporarily use trunc_normal_ instead of TruncatedNormal + trunc_normal_(self.fc.weight, std=self.std) + + def forward(self, x): + """Define how the head is going to run. + Args: + x (paddle.Tensor): The input data. + Returns: + score: (paddle.Tensor) The classification scores for input samples. + """ + # XXX: check dropout location! + # x.shape = [N, embed_dim] + + score = self.fc(x) + # [N, num_class] + # x = F.softmax(x) # NOTE remove + return score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/pptsm_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/pptsm_head.py new file mode 100644 index 0000000..45f50fd --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/pptsm_head.py @@ -0,0 +1,92 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import ParamAttr +from paddle.nn import Linear +from paddle.regularizer import L2Decay +from .tsn_head import TSNHead +from ..registry import HEADS + +from ..weight_init import weight_init_ + + +@HEADS.register() +class ppTSMHead(TSNHead): + """ ppTSM Head + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + drop_ratio(float): drop ratio. Default: 0.8. + std(float): Std(Scale) value in normal initilizar. Default: 0.001. + kwargs (dict, optional): Any keyword argument to initialize. + """ + def __init__( + self, + num_classes, + in_channels, # NOTE: 2048 for >= R50, 512 for <= R34 + drop_ratio=0.8, + std=0.01, + data_format="NCHW", + num_seg=8, + **kwargs): + + super().__init__(num_classes, + in_channels, + drop_ratio=drop_ratio, + std=std, + data_format=data_format, + **kwargs) + + self.fc = Linear(self.in_channels, + self.num_classes, + weight_attr=ParamAttr(learning_rate=5.0, + regularizer=L2Decay(1e-4)), + bias_attr=ParamAttr(learning_rate=10.0, + regularizer=L2Decay(0.0))) + self.stdv = std + self.num_seg = num_seg + + def init_weights(self): + """Initiate the FC layer parameters""" + weight_init_(self.fc, 'Normal', 'fc_0.w_0', 'fc_0.b_0', std=self.stdv) + + def forward(self, x, num_seg=None): + """Define how the head is going to run. + Args: + x (paddle.Tensor): The input data. + num_segs (int): Number of segments. + Returns: + score: (paddle.Tensor) The classification scores for input samples. + """ + + #XXX: check dropout location! + # [N * num_segs, in_channels, 7, 7] + x = self.avgpool2d(x) + # [N * num_segs, in_channels, 1, 1] + if self.dropout is not None: + x = self.dropout(x) + # [N * num_seg, in_channels, 1, 1] + num_seg = num_seg if num_seg is not None else self.num_seg + x = paddle.reshape(x, [-1, num_seg, x.shape[1]]) + # [N, num_seg, in_channels] + x = paddle.mean(x, axis=1) + # [N, in_channels] + x = paddle.reshape(x, shape=[-1, self.in_channels]) + # [N, in_channels] + score = self.fc(x) + # [N, num_class] + #x = F.softmax(x) #NOTE remove + return score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/pptsn_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/pptsn_head.py new file mode 100644 index 0000000..2655c90 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/pptsn_head.py @@ -0,0 +1,103 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
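# A small standalone illustration (not part of this diff) of the temporal consensus used by
# ppTSMHead above and ppTSNHead below: per-segment pooled features of shape [N * num_seg, C]
# are regrouped to [N, num_seg, C] and averaged over the segment axis, giving one feature
# vector per video before the FC classifier. Shapes here are arbitrary example values.
import paddle

N, num_seg, C = 2, 8, 512
seg_feats = paddle.randn([N * num_seg, C])                   # pooled per-segment features
video_feats = paddle.mean(paddle.reshape(seg_feats, [N, num_seg, C]), axis=1)
print(video_feats.shape)                                      # [2, 512] -> one vector per video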
+ +import paddle +from paddle import ParamAttr +from paddle.nn import AdaptiveAvgPool2D, Linear, Dropout +from paddle.regularizer import L2Decay +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +@HEADS.register() +class ppTSNHead(BaseHead): + """ppTSN Head. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + drop_ratio(float): drop ratio. Default: 0.4. + std(float): Std(Scale) value in normal initilizar. Default: 0.01. + data_format(str): data format of input tensor in ['NCHW', 'NHWC']. Default: 'NCHW'. + fclr5(bool): Whether to increase the learning rate of the fully connected layer. Default: True + kwargs (dict, optional): Any keyword argument to initialize. + + """ + def __init__(self, + num_classes, + in_channels, + loss_cfg=dict(name='CrossEntropyLoss'), + drop_ratio=0.4, + std=0.01, + data_format="NCHW", + fclr5=True, + **kwargs): + + super().__init__(num_classes, in_channels, loss_cfg, **kwargs) + self.drop_ratio = drop_ratio + self.std = std + + # NOTE: global pool performance + self.avgpool2d = AdaptiveAvgPool2D((1, 1), data_format=data_format) + + if self.drop_ratio != 0: + self.dropout = Dropout(p=self.drop_ratio) + else: + self.dropout = None + self.fc = Linear( + self.in_channels, + self.num_classes, + weight_attr=ParamAttr(learning_rate=5.0 if fclr5 else 1.0, + regularizer=L2Decay(1e-4)), + bias_attr=ParamAttr(learning_rate=10.0 if fclr5 else 1.0, + regularizer=L2Decay(0.0))) + + def init_weights(self): + """Initiate the FC layer parameters""" + weight_init_(self.fc, + 'Normal', + 'fc_0.w_0', + 'fc_0.b_0', + mean=0., + std=self.std) + + def forward(self, x, num_seg=8): + """Define how the head is going to run. + + Args: + x (paddle.Tensor): The input data. + num_segs (int): Number of segments. + Returns: + score: (paddle.Tensor) The classification scores for input samples. + """ + + # XXX: check dropout location! + # [N * num_segs, in_channels, 7, 7] + x = self.avgpool2d(x) + # [N * num_segs, in_channels, 1, 1] + x = paddle.reshape(x, [-1, num_seg, x.shape[1]]) + # [N, num_seg, in_channels] + x = paddle.mean(x, axis=1) + # [N, in_channels] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels] + x = paddle.reshape(x, shape=[-1, self.in_channels]) + # [N, in_channels] + score = self.fc(x) + # [N, num_class] + # x = F.softmax(x) # NOTE remove + return score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/roi_extractor.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/roi_extractor.py new file mode 100644 index 0000000..3aaef23 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/roi_extractor.py @@ -0,0 +1,53 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
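# A hedged usage sketch (not part of this diff) of paddle.vision.ops.roi_align, which the
# RoIAlign wrapper defined below delegates to: boxes are [x1, y1, x2, y2] in input-image
# coordinates and are projected onto the feature map by spatial_scale. Tensor sizes and the
# output_size value are example assumptions, not taken from the repo's configs.
import paddle

feat = paddle.randn([1, 256, 14, 14])                         # [N, C, H, W] feature map
boxes = paddle.to_tensor([[0., 0., 112., 112.]])              # one RoI in image coordinates
boxes_num = paddle.to_tensor([1], dtype='int32')              # number of RoIs per image
roi_feat = paddle.vision.ops.roi_align(
    feat, boxes, boxes_num, output_size=7, spatial_scale=1. / 16)
print(roi_feat.shape)                                          # [1, 256, 7, 7]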
+ +import paddle + +#@register +class RoIAlign(object): + + def __init__(self, + resolution=14, + spatial_scale=0.0625, + sampling_ratio=0, + aligned=False): + super(RoIAlign, self).__init__() + self.resolution = resolution + self.spatial_scale = spatial_scale + self.sampling_ratio = sampling_ratio + self.aligned = aligned + + def __call__(self, feats, roi, rois_num): + roi = paddle.concat(roi) if len(roi) > 1 else roi[0] + rois_num = paddle.to_tensor(rois_num, dtype='int32') + rois_num = paddle.cast(rois_num, dtype='int32') + if len(feats) == 1: + roi_feat = paddle.vision.ops.roi_align(feats, + roi, + rois_num, + self.resolution, + self.spatial_scale, + self.sampling_ratio, + self.aligned) + else: + rois_feat_list = [] + roi_feat = paddle.vision.ops.roi_align(feats, + roi, + rois_num, + self.resolution, + self.spatial_scale, + self.sampling_ratio, + self.aligned) + + return roi_feat diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/roi_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/roi_head.py new file mode 100644 index 0000000..be34a33 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/roi_head.py @@ -0,0 +1,177 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn as nn +from .. import builder +from ..registry import HEADS + + +def bbox2result(bboxes, labels, num_classes, img_shape, thr=0.01): + """Convert detection results to a list of numpy arrays. """ + if len(bboxes) == 0: + return list(np.zeros((num_classes - 1, 0, 5), dtype=np.float32)) + else: + bboxes = bboxes[0] + labels = labels + img_shape_np = img_shape + img_h, img_w = img_shape_np[0][0], img_shape_np[0][1] + + img_w = paddle.cast(img_w, dtype='int32') + img_h = paddle.cast(img_h, dtype='int32') + + bboxes[:, 0::2] /= img_w + bboxes[:, 1::2] /= img_h + + # We only handle multilabel now + assert labels.shape[-1] > 1 + + scores = labels # rename + thr = (thr, ) * num_classes if isinstance(thr, float) else thr + assert scores.shape[1] == num_classes + assert len(thr) == num_classes + + result = [] + for i in range(num_classes - 1): + #step1. 对该类, 每个bbox的得分是否大于阈值 + where = scores[:, i + 1] > thr[i + 1] + + where = paddle.nonzero(where) # index + bboxes_select = paddle.index_select(x=bboxes, index=where) + bboxes_select = bboxes_select[:, :4] + + scores_select = paddle.index_select(x=scores, index=where) + scores_select = scores_select[:, i + 1:i + 2] + + result.append( + #对于step1中得分大于阈值的bbox(可能为空), 将bbox及在该类的score放入result列表. 
+ paddle.concat((bboxes_select, scores_select), axis=1).numpy()) + + return result + + +@HEADS.register() +class AVARoIHead(nn.Layer): + + def __init__(self, + assigner, + sampler, + pos_weight=1.0, + action_thr=0.0, + bbox_roi_extractor=None, + bbox_head=None, + train_cfg=None, + test_cfg=None): + super().__init__() + self.assigner = assigner + self.sampler = sampler + self.pos_weight = pos_weight + self.action_thr = action_thr + self.init_assigner_sampler() + if bbox_head is not None: + self.init_bbox_head(bbox_roi_extractor, bbox_head) + + def init_assigner_sampler(self): + """Initialize assigner and sampler.""" + self.bbox_assigner = None + self.bbox_sampler = None + self.bbox_assigner = builder.build_assigner(self.assigner) + self.bbox_sampler = builder.build_sampler(self.sampler, context=self) + + def init_bbox_head(self, bbox_roi_extractor, bbox_head): + """Initialize ``bbox_head``""" + self.bbox_roi_extractor = builder.build_roi_extractor( + bbox_roi_extractor) + self.bbox_head = builder.build_head(bbox_head) + + def _bbox_forward(self, x, rois, rois_num): + bbox_feat = self.bbox_roi_extractor(x, rois, rois_num) + cls_score, bbox_pred = self.bbox_head( + bbox_feat, rois, rois_num + ) #deal with: when roi's width or height = 0 , roi_align is wrong + bbox_results = dict(cls_score=cls_score, + bbox_pred=bbox_pred, + bbox_feats=bbox_feat) + return bbox_results + + def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels): + """Run forward function and calculate loss for box head in training.""" + rois = [res.bboxes for res in sampling_results] + rois_num = [res.bboxes.shape[0] for res in sampling_results] + bbox_results = self._bbox_forward(x, rois, rois_num) + bbox_targets = self.bbox_head.get_targets(sampling_results, gt_bboxes, + gt_labels, self.pos_weight) + loss_bbox = self.bbox_head.loss(bbox_results['cls_score'], bbox_targets) + bbox_results.update(loss_bbox=loss_bbox) + return bbox_results + + def train_step(self, x, img_metas, proposal_list, gt_bboxes, gt_labels): + #1. assign gts and sample proposals + num_imgs = len(img_metas[0]) + sampling_results = [] + for i in range(num_imgs): + assign_result = self.bbox_assigner.assign(proposal_list[i], + gt_bboxes[i], + gt_labels[i]) + sampling_result = self.bbox_sampler.sample(assign_result, + proposal_list[i], + gt_bboxes[i], + gt_labels[i]) + sampling_results.append(sampling_result) + + #2. 
forward and loss + bbox_results = self._bbox_forward_train(x, sampling_results, gt_bboxes, + gt_labels) + losses = dict() + losses.update(bbox_results['loss_bbox']) + + return losses + + def simple_test(self, x, proposal_list, img_shape, rescale=False): + x_shape = x[0].shape + #assert x_shape[0] == 1, 'only accept 1 sample at test mode' + + det_bboxes, det_labels = self.simple_test_bboxes(x, + img_shape, + proposal_list, + self.action_thr, + rescale=rescale) + + bbox_results = bbox2result(det_bboxes, det_labels, + self.bbox_head.num_classes, img_shape, + self.action_thr) + return [bbox_results] + + def simple_test_bboxes(self, + x, + img_shape, + proposals, + action_thr, + rescale=False): + """Test only det bboxes without augmentation.""" + rois = [proposals] + rois_num = [rois[0].shape[0]] + bbox_results = self._bbox_forward(x, rois, rois_num) + cls_score = bbox_results['cls_score'] + crop_quadruple = np.array([0, 0, 1, 1]) + flip = False + det_bboxes, det_labels = self.bbox_head.get_det_bboxes( + rois, + cls_score, + img_shape, + flip=flip, + crop_quadruple=crop_quadruple) + + return det_bboxes, det_labels diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/single_straight3d.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/single_straight3d.py new file mode 100644 index 0000000..805d93e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/single_straight3d.py @@ -0,0 +1,79 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +import paddle.nn as nn +import numpy as np +from ..registry import ROI_EXTRACTORS +from .roi_extractor import RoIAlign + + +@ROI_EXTRACTORS.register() +class SingleRoIExtractor3D(nn.Layer): + """Extract RoI features from a single level feature map. """ + def __init__(self, + roi_layer_type='RoIAlign', + featmap_stride=16, + output_size=16, + sampling_ratio=0, + pool_mode='avg', + aligned=True, + with_temporal_pool=True, + with_global=False): + super().__init__() + self.roi_layer_type = roi_layer_type + assert self.roi_layer_type in ['RoIPool', 'RoIAlign'] + self.featmap_stride = featmap_stride + self.spatial_scale = 1. 
/ self.featmap_stride + self.output_size = output_size + self.sampling_ratio = sampling_ratio + self.pool_mode = pool_mode + self.aligned = aligned + self.with_temporal_pool = with_temporal_pool + self.with_global = with_global + + self.roi_layer = RoIAlign(resolution=self.output_size, + spatial_scale=self.spatial_scale, + sampling_ratio=self.sampling_ratio, + aligned=self.aligned) + + def init_weights(self): + pass + + # The shape of feat is N, C, T, H, W + def forward(self, feat, rois, rois_num): + if len(feat) >= 2: + assert self.with_temporal_pool + if self.with_temporal_pool: + xi = 0 + for x in feat: + xi = xi + 1 + y = paddle.mean(x, 2, keepdim=True) + feat = [paddle.mean(x, 2, keepdim=True) for x in feat] + feat = paddle.concat(feat, axis=1) # merge slow and fast + roi_feats = [] + for t in range(feat.shape[2]): + if type(t) == paddle.static.Variable: + index = paddle.to_tensor(t) + else: + data_index = np.array([t]).astype('int32') + index = paddle.to_tensor(data_index) + + frame_feat = paddle.index_select(feat, index, axis=2) + frame_feat = paddle.squeeze(frame_feat, + axis=2) #axis=2,避免N=1时, 第一维度被删除. + roi_feat = self.roi_layer(frame_feat, rois, rois_num) + roi_feats.append(roi_feat) + + ret = paddle.stack(roi_feats, axis=2) + return ret diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/slowfast_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/slowfast_head.py new file mode 100644 index 0000000..bd18baf --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/slowfast_head.py @@ -0,0 +1,137 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..registry import HEADS +from .base import BaseHead + +import paddle +import paddle.nn.functional as F + +from ..weight_init import weight_init_ + + +@HEADS.register() +class SlowFastHead(BaseHead): + """ + ResNe(X)t 3D head. + This layer performs a fully-connected projection during training, when the + input size is 1x1x1. It performs a convolutional projection during testing + when the input size is larger than 1x1x1. If the inputs are from multiple + different pathways, the inputs will be concatenated after pooling. + """ + def __init__(self, + width_per_group, + alpha, + beta, + num_classes, + num_frames, + crop_size, + dropout_rate, + pool_size_ratio=[[1, 1, 1], [1, 1, 1]], + loss_cfg=dict(name='CrossEntropyLoss'), + multigrid_short=False, + **kwargs): + """ + ResNetBasicHead takes p pathways as input where p in [1, infty]. + + Args: + dim_in (list): the list of channel dimensions of the p inputs to the + ResNetHead. + num_classes (int): the channel dimensions of the p outputs to the + ResNetHead. + pool_size (list): the list of kernel sizes of p spatial temporal + poolings, temporal pool kernel size, spatial pool kernel size, + spatial pool kernel size in order. + dropout_rate (float): dropout rate. If equal to 0.0, perform no + dropout. 
+ """ + super().__init__(num_classes, loss_cfg, **kwargs) + self.multigrid_short = multigrid_short + self.width_per_group = width_per_group + self.alpha = alpha + self.beta = beta + self.num_classes = num_classes + self.num_frames = num_frames + self.crop_size = crop_size + self.dropout_rate = dropout_rate + self.pool_size_ratio = pool_size_ratio + + self.dim_in = [ + self.width_per_group * 32, + self.width_per_group * 32 // self.beta, + ] + self.pool_size = [None, None] if self.multigrid_short else [ + [ + self.num_frames // self.alpha // self.pool_size_ratio[0][0], + self.crop_size // 32 // self.pool_size_ratio[0][1], + self.crop_size // 32 // self.pool_size_ratio[0][2], + ], + [ + self.num_frames // self.pool_size_ratio[1][0], + self.crop_size // 32 // self.pool_size_ratio[1][1], + self.crop_size // 32 // self.pool_size_ratio[1][2], + ], + ] + + assert (len({len(self.pool_size), len(self.dim_in) + }) == 1), "pathway dimensions are not consistent." + self.num_pathways = len(self.pool_size) + + self.dropout = paddle.nn.Dropout(p=self.dropout_rate) + + self.projection = paddle.nn.Linear( + in_features=sum(self.dim_in), + out_features=self.num_classes, + ) + + def init_weights(self): + weight_init_(self.projection, + "Normal", + bias_value=0.0, + mean=0.0, + std=0.01) + + def forward(self, inputs): + assert (len(inputs) == self.num_pathways + ), "Input tensor does not contain {} pathway".format( + self.num_pathways) + pool_out = [] + for pathway in range(self.num_pathways): + if self.pool_size[pathway] is None: + tmp_out = F.adaptive_avg_pool3d(x=inputs[pathway], + output_size=(1, 1, 1), + data_format="NCDHW") + else: + tmp_out = F.avg_pool3d(x=inputs[pathway], + kernel_size=self.pool_size[pathway], + stride=1, + data_format="NCDHW") + pool_out.append(tmp_out) + + x = paddle.concat(x=pool_out, axis=1) + x = paddle.transpose(x=x, perm=(0, 2, 3, 4, 1)) + + # Perform dropout. + if self.dropout_rate > 0.0: + x = self.dropout(x) + + x = self.projection(x) + + # Performs fully convlutional inference. + if not self.training: # attr of base class + x = F.softmax(x, axis=4) + x = paddle.mean(x, axis=[1, 2, 3]) + + x = paddle.reshape(x, shape=(x.shape[0], -1)) + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/stgcn_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/stgcn_head.py new file mode 100644 index 0000000..fc80d66 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/stgcn_head.py @@ -0,0 +1,50 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +@HEADS.register() +class STGCNHead(BaseHead): + """ + Head for ST-GCN model. + Args: + in_channels: int, input feature channels. Default: 256. + num_classes: int, number classes. Default: 10. 
+ """ + def __init__(self, in_channels=256, num_classes=10, **kwargs): + super().__init__(num_classes, in_channels, **kwargs) + self.fcn = nn.Conv2D(in_channels=in_channels, + out_channels=num_classes, + kernel_size=1) + + def init_weights(self): + """Initiate the parameters. + """ + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + weight_init_(layer, 'Normal', std=0.02) + + def forward(self, x): + """Define how the head is going to run. + """ + x = self.fcn(x) + x = paddle.reshape_(x, (x.shape[0], -1)) # N,C,1,1 --> N,C + + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/timesformer_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/timesformer_head.py new file mode 100644 index 0000000..d02a3cc --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/timesformer_head.py @@ -0,0 +1,70 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.nn import Linear + +from ..registry import HEADS +from ..weight_init import trunc_normal_, weight_init_ +from .base import BaseHead + + +@HEADS.register() +class TimeSformerHead(BaseHead): + """TimeSformerHead Head. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + std(float): Std(Scale) value in normal initilizar. Default: 0.01. + kwargs (dict, optional): Any keyword argument to initialize. + + """ + def __init__(self, + num_classes, + in_channels, + loss_cfg=dict(name='CrossEntropyLoss'), + std=0.02, + **kwargs): + + super().__init__(num_classes, in_channels, loss_cfg, **kwargs) + self.std = std + self.fc = Linear(self.in_channels, self.num_classes) + + def init_weights(self): + """Initiate the FC layer parameters""" + + weight_init_(self.fc, + 'TruncatedNormal', + 'fc_0.w_0', + 'fc_0.b_0', + mean=0.0, + std=self.std) + # NOTE: Temporarily use trunc_normal_ instead of TruncatedNormal + trunc_normal_(self.fc.weight, std=self.std) + + def forward(self, x): + """Define how the head is going to run. + Args: + x (paddle.Tensor): The input data. + Returns: + score: (paddle.Tensor) The classification scores for input samples. + """ + # XXX: check dropout location! + # x.shape = [N, embed_dim] + + score = self.fc(x) + # [N, num_class] + # x = F.softmax(x) # NOTE remove + return score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/token_shift_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/token_shift_head.py new file mode 100644 index 0000000..52e9309 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/token_shift_head.py @@ -0,0 +1,79 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.nn import Linear +import paddle + +from ..registry import HEADS +from ..weight_init import trunc_normal_, weight_init_ +from .base import BaseHead + + +@HEADS.register() +class TokenShiftHead(BaseHead): + """TokenShift Transformer Head. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + num_seg(int): The number of segments. Default: 8. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + ls_eps (float): Label smoothing epsilon. Default: 0.01. + std (float): Std(Scale) Value in normal initilizar. Default: 0.02. + kwargs (dict, optional): Any keyword argument to initialize. + + """ + def __init__(self, + num_classes, + in_channels, + num_seg=8, + loss_cfg=dict(name='CrossEntropyLoss'), + ls_eps=0.01, + std=0.02, + **kwargs): + + super().__init__(num_classes, in_channels, loss_cfg, ls_eps) + self.num_seg = num_seg + self.std = std + self.fc = Linear(self.in_channels, self.num_classes) + + def init_weights(self): + """Initiate the FC layer parameters""" + + weight_init_(self.fc, + 'TruncatedNormal', + 'fc_0.w_0', + 'fc_0.b_0', + mean=0.0, + std=self.std) + # NOTE: Temporarily use trunc_normal_ instead of TruncatedNormal + trunc_normal_(self.fc.weight, std=self.std) + + def forward(self, x): + """Define how the head is going to run. + Args: + x (paddle.Tensor): The input data. + Returns: + score: (paddle.Tensor) The classification scores for input samples. + """ + # XXX: check dropout location! + # x.shape = [N, embed_dim] + score = self.fc(x) + # [N*T, num_class] + _, _m = score.shape + _t = self.num_seg + score = score.reshape([-1, _t, _m]) + score = paddle.mean(score, 1) # averaging predictions for every frame + score = paddle.squeeze(score, axis=1) + return score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/transnetv2_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/transnetv2_head.py new file mode 100644 index 0000000..2ea67d4 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/transnetv2_head.py @@ -0,0 +1,45 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .base import BaseHead +from ..registry import HEADS +from ..losses import TransNetV2Loss +from ...metrics.transnetv2_metric import create_scene_based_summaries + +@HEADS.register() +class TransNetV2Head(BaseHead): + """TransNetV2 Head. 
+    """
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 loss_cfg=dict(name="TransNetV2Loss")
+                 ):
+        super().__init__(num_classes,
+                         in_channels,
+                         loss_cfg)
+
+    def loss(self, one_hot_pred, one_hot_gt,
+             many_hot_pred=None, many_hot_gt=None, reg_losses=None):
+        losses = dict()
+        # forward the head's own predictions and targets to the configured loss
+        loss = self.loss_func(one_hot_pred, one_hot_gt)
+
+        f1 = self.get_score(one_hot_pred, one_hot_gt)
+        losses['f1'] = f1
+        losses['loss'] = loss
+        return losses
+
+    def get_score(self, one_hot_pred, one_hot_gt):
+        f1 = create_scene_based_summaries(one_hot_pred, one_hot_gt)
+        return f1
diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/tsm_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/tsm_head.py
new file mode 100644
index 0000000..9559301
--- /dev/null
+++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/tsm_head.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import paddle
+from paddle import ParamAttr
+from paddle.nn import Linear
+import paddle.nn.functional as F
+from paddle.regularizer import L2Decay
+from .tsn_head import TSNHead
+from ..registry import HEADS
+
+from ..weight_init import weight_init_
+
+
+@HEADS.register()
+class TSMHead(TSNHead):
+    """ TSM Head
+
+    Args:
+        num_classes (int): The number of classes to be classified.
+        in_channels (int): The number of channels in input feature.
+        loss_cfg (dict): Config for building loss. Default: dict(name='CrossEntropyLoss').
+        drop_ratio(float): drop ratio. Default: 0.5.
+        std(float): Std(Scale) value in normal initializer. Default: 0.001.
+        kwargs (dict, optional): Any keyword argument to initialize.
+    """
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 drop_ratio=0.5,
+                 std=0.001,
+                 data_format="NCHW",
+                 **kwargs):
+        super().__init__(num_classes,
+                         in_channels,
+                         drop_ratio=drop_ratio,
+                         std=std,
+                         data_format=data_format,
+                         **kwargs)
+
+        self.fc = Linear(self.in_channels,
+                         self.num_classes,
+                         weight_attr=ParamAttr(learning_rate=5.0,
+                                               regularizer=L2Decay(1e-4)),
+                         bias_attr=ParamAttr(learning_rate=10.0,
+                                             regularizer=L2Decay(0.0)))
+
+        assert (data_format in [
+            'NCHW', 'NHWC'
+        ]), f"data_format must be 'NCHW' or 'NHWC', but got {data_format}"
+
+        self.data_format = data_format
+
+        self.stdv = std
+
+    def init_weights(self):
+        """Initiate the FC layer parameters"""
+        weight_init_(self.fc, 'Normal', 'fc_0.w_0', 'fc_0.b_0', std=self.stdv)
+
+    def forward(self, x, num_seg):
+        """Define how the tsm-head is going to run.
+
+        Args:
+            x (paddle.Tensor): The input data.
+            num_seg (int): Number of segments.
+        Returns:
+            score: (paddle.Tensor) The classification scores for input samples.
+ """ + # x.shape = [N * num_segs, in_channels, 7, 7] + + x = self.avgpool2d(x) # [N * num_segs, in_channels, 1, 1] + + if self.dropout is not None: + x = self.dropout(x) # [N * num_seg, in_channels, 1, 1] + + if self.data_format == 'NCHW': + x = paddle.reshape(x, x.shape[:2]) + else: + x = paddle.reshape(x, x.shape[::3]) + score = self.fc(x) # [N * num_seg, num_class] + score = paddle.reshape( + score, [-1, num_seg, score.shape[1]]) # [N, num_seg, num_class] + score = paddle.mean(score, axis=1) # [N, num_class] + score = paddle.reshape(score, + shape=[-1, self.num_classes]) # [N, num_class] + # score = F.softmax(score) #NOTE remove + return score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/tsn_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/tsn_head.py new file mode 100644 index 0000000..f2f906b --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/tsn_head.py @@ -0,0 +1,93 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.nn import AdaptiveAvgPool2D, Linear, Dropout + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +@HEADS.register() +class TSNHead(BaseHead): + """TSN Head. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + drop_ratio(float): drop ratio. Default: 0.4. + std(float): Std(Scale) value in normal initilizar. Default: 0.01. + kwargs (dict, optional): Any keyword argument to initialize. + + """ + def __init__(self, + num_classes, + in_channels, + loss_cfg=dict(name='CrossEntropyLoss'), + drop_ratio=0.4, + std=0.01, + data_format="NCHW", + **kwargs): + + super().__init__(num_classes, in_channels, loss_cfg, **kwargs) + self.drop_ratio = drop_ratio + self.std = std + + #NOTE: global pool performance + self.avgpool2d = AdaptiveAvgPool2D((1, 1), data_format=data_format) + + if self.drop_ratio != 0: + self.dropout = Dropout(p=self.drop_ratio) + else: + self.dropout = None + + self.fc = Linear(self.in_channels, self.num_classes) + + def init_weights(self): + """Initiate the FC layer parameters""" + + weight_init_(self.fc, + 'Normal', + 'fc_0.w_0', + 'fc_0.b_0', + mean=0., + std=self.std) + + def forward(self, x, num_seg): + """Define how the head is going to run. + Args: + x (paddle.Tensor): The input data. + num_segs (int): Number of segments. + Returns: + score: (paddle.Tensor) The classification scores for input samples. + """ + + #XXX: check dropout location! 
+ # [N * num_segs, in_channels, 7, 7] + + x = self.avgpool2d(x) + # [N * num_segs, in_channels, 1, 1] + x = paddle.reshape(x, [-1, num_seg, x.shape[1]]) + # [N, num_seg, in_channels] + x = paddle.mean(x, axis=1) + # [N, in_channels] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels] + score = self.fc(x) + # [N, num_class] + #x = F.softmax(x) #NOTE remove + return score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__init__.py new file mode 100644 index 0000000..d784c4c --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .base import BaseWeightedLoss +from .bmn_loss import BMNLoss +from .cross_entropy_loss import CrossEntropyLoss +from .depth_loss import ADDSLoss +from .transnetv2_loss import TransNetV2Loss +from .actbert_loss import ActBertLoss +from .asrf_loss import ASRFLoss +from .distillation_loss import DistillationCELoss, DistillationDMLLoss +from .yowo_loss import RegionLoss + +__all__ = [ + 'CrossEntropyLoss', 'BMNLoss', 'TransNetV2Loss', 'ActBertLoss', 'ADDSLoss', + 'BaseWeightedLoss', 'ASRFLoss', 'DistillationCELoss', 'DistillationDMLLoss', + 'RegionLoss' +] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..72125a0 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/actbert_loss.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/actbert_loss.cpython-310.pyc new file mode 100644 index 0000000..f4ac760 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/actbert_loss.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/asrf_loss.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/asrf_loss.cpython-310.pyc new file mode 100644 index 0000000..ad94ef8 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/asrf_loss.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..20b1111 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/bmn_loss.cpython-310.pyc 
b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/bmn_loss.cpython-310.pyc new file mode 100644 index 0000000..d22aa00 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/bmn_loss.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/cross_entropy_loss.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/cross_entropy_loss.cpython-310.pyc new file mode 100644 index 0000000..f3d06b5 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/cross_entropy_loss.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/depth_loss.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/depth_loss.cpython-310.pyc new file mode 100644 index 0000000..1f884a1 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/depth_loss.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/distillation_loss.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/distillation_loss.cpython-310.pyc new file mode 100644 index 0000000..b766587 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/distillation_loss.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/transnetv2_loss.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/transnetv2_loss.cpython-310.pyc new file mode 100644 index 0000000..6463772 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/transnetv2_loss.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/yowo_loss.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/yowo_loss.cpython-310.pyc new file mode 100644 index 0000000..d69a0a5 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/yowo_loss.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/actbert_loss.py b/Bank_second_part/detect_process/paddlevideo/modeling/losses/actbert_loss.py new file mode 100644 index 0000000..10ffea6 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/losses/actbert_loss.py @@ -0,0 +1,75 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
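# A small standalone example (not part of this diff) of the masking convention the losses
# below rely on: paddle.nn.CrossEntropyLoss(ignore_index=-1) skips positions whose label is
# -1, which is how ActBertLoss handles masked-LM style text and action targets. The tensor
# sizes are illustrative only.
import paddle
import paddle.nn as nn

loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
logits = paddle.randn([4, 10])                      # 4 token positions, 10 classes
labels = paddle.to_tensor([3, -1, 5, -1])           # -1 marks positions to ignore
print(loss_fct(logits, labels))                     # averaged over the 2 labelled positions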
+ +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ..registry import LOSSES +from .base import BaseWeightedLoss + + +@LOSSES.register() +class ActBertLoss(BaseWeightedLoss): + """Loss for ActBert model + """ + def __init__(self, vocab_size=30522, a_target_size=700): + super().__init__() + self.vocab_size = vocab_size + self.a_target_size = a_target_size + self.loss_fct = nn.CrossEntropyLoss(ignore_index=-1) + self.vis_criterion = nn.KLDivLoss(reduction="none") + + def forward(self, prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score, \ + text_labels, image_label, image_target, action_label, next_sentence_label): + """ + Args: + text_label: text label(with mask). Shape: [batch_size, seqence_length] + image_label: image label(with mask). Shape: [batch_size, region_length] + image_target: label of image feature distribution, + Shape: [batch_size, region_length-1, num_image_class](minus 1 for xxx). + action label: action label(with mask), Shape: [batch_size, action_length] + next_sentence_label: is next sentence or not. Shape: [batch_size] + """ + prediction_scores_v = prediction_scores_v[:, + 1:] #8,37,1601 --> 8,36,1601 + + img_loss = self.vis_criterion( + F.log_softmax(prediction_scores_v, axis=2), + image_target #8,36,1601 + ) + masked_img_loss = paddle.sum( + img_loss * (image_label == 1).unsqueeze(2).astype('float32')) / max( + paddle.sum((image_label == 1).astype('float32')), 1e-6) + + masked_text_loss = self.loss_fct( + prediction_scores_t.reshape([-1, self.vocab_size]), #8,36,30522 + text_labels.reshape([-1]), #8,36 # label -1 will be ignored + ) + + masked_action_loss = self.loss_fct( + prediction_scores_a.reshape([-1, self.a_target_size]), #8,5,700 + action_label.reshape([-1]), #8,5 + ) + + next_sentence_loss = self.loss_fct( + seq_relationship_score.reshape([-1, 2]), + next_sentence_label.reshape([-1]) #8,2 + ) + + total_loss = masked_text_loss.unsqueeze(0) + masked_img_loss.unsqueeze( + 0) + masked_action_loss.unsqueeze(0) + next_sentence_loss.unsqueeze( + 0) + return total_loss diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/asrf_loss.py b/Bank_second_part/detect_process/paddlevideo/modeling/losses/asrf_loss.py new file mode 100644 index 0000000..ce5d6b1 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/losses/asrf_loss.py @@ -0,0 +1,401 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# https://github.com/yiskw713/asrf/libs/loss_fn/__init__.py + +import numpy as np +import pandas as pd +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import sys +import os + +from ..registry import LOSSES + + +class TMSE(nn.Layer): + """ + Temporal MSE Loss Function + Proposed in Y. A. Farha et al. 
MS-TCN: Multi-Stage Temporal Convolutional Network for ActionSegmentation in CVPR2019 + arXiv: https://arxiv.org/pdf/1903.01945.pdf + """ + + def __init__(self, threshold=4, ignore_index=255): + super().__init__() + self.threshold = threshold + self.ignore_index = ignore_index + self.mse = nn.MSELoss(reduction="none") + + def forward(self, preds, gts): + + total_loss = 0.0 + batch_size = preds.shape[0] + for pred, gt in zip(preds, gts): + pred = paddle.gather(pred, + paddle.nonzero(gt != self.ignore_index)[:, 0]) + + loss = self.mse(F.log_softmax(pred[:, 1:], axis=1), + F.log_softmax(pred[:, :-1], axis=1)) + + loss = paddle.clip(loss, min=0, max=self.threshold**2) + total_loss += paddle.mean(loss) + + return total_loss / batch_size + + +class GaussianSimilarityTMSE(nn.Layer): + """ + Temporal MSE Loss Function with Gaussian Similarity Weighting + """ + + def __init__(self, threshold=4, sigma=1.0, ignore_index=255): + super().__init__() + self.threshold = threshold + self.ignore_index = ignore_index + self.mse = nn.MSELoss(reduction="none") + self.sigma = sigma + + def forward(self, preds, gts, sim_index): + """ + Args: + preds: the output of model before softmax. (N, C, T) + gts: Ground Truth. (N, T) + sim_index: similarity index. (N, C, T) + Return: + the value of Temporal MSE weighted by Gaussian Similarity. + """ + total_loss = 0.0 + batch_size = preds.shape[0] + for pred, gt, sim in zip(preds, gts, sim_index): + pred = paddle.gather(pred, + paddle.nonzero(gt != self.ignore_index)[:, 0], + axis=1) + sim = paddle.gather(sim, + paddle.nonzero(gt != self.ignore_index)[:, 0], + axis=1) + + # calculate gaussian similarity + diff = sim[:, 1:] - sim[:, :-1] + similarity = paddle.exp( + (-1 * paddle.norm(diff, axis=0)) / (2 * self.sigma**2)) + + # calculate temporal mse + loss = self.mse(F.log_softmax(pred[:, 1:], axis=1), + F.log_softmax(pred[:, :-1], axis=1)) + loss = paddle.clip(loss, min=0, max=self.threshold**2) + + # gaussian similarity weighting + loss = similarity * loss + + total_loss += paddle.mean(loss) + + return total_loss / batch_size + + +class FocalLoss(nn.Layer): + + def __init__(self, + weight=None, + size_average=True, + batch_average=True, + ignore_index=255, + gamma=2.0, + alpha=0.25): + super().__init__() + + self.gamma = gamma + self.alpha = alpha + self.batch_average = batch_average + self.criterion = nn.CrossEntropyLoss(weight=weight, + ignore_index=ignore_index, + size_average=size_average) + + def forward(self, logit, target): + n, _, _ = logit.size() + + logpt = -self.criterion(logit, target.long()) + pt = paddle.exp(logpt) + + if self.alpha is not None: + logpt *= self.alpha + + loss = -((1 - pt)**self.gamma) * logpt + + if self.batch_average: + loss /= n + + return loss + + +class ActionSegmentationLoss(nn.Layer): + """ + Loss Function for Action Segmentation + You can choose the below loss functions and combine them. 
+ - Cross Entropy Loss (CE) + - Focal Loss + - Temporal MSE (TMSE) + - Gaussian Similarity TMSE (GSTMSE) + """ + + def __init__(self, + num_classes, + file_path, + label_path, + ce=True, + focal=True, + tmse=False, + gstmse=False, + weight=None, + threshold=4., + ignore_index=255, + ce_weight=1.0, + focal_weight=1.0, + tmse_weight=0.15, + gstmse_weight=0.15): + super().__init__() + self.criterions = [] + self.weights = [] + + self.num_classes = num_classes + self.file_path = file_path + self.label_path = label_path + if weight: + class_weight = self.get_class_weight() + else: + class_weight = None + + if ce: + self.criterions.append( + nn.CrossEntropyLoss(weight=class_weight, + ignore_index=ignore_index)) + self.weights.append(ce_weight) + + if focal: + self.criterions.append(FocalLoss(ignore_index=ignore_index)) + self.weights.append(focal_weight) + + if tmse: + self.criterions.append( + TMSE(threshold=threshold, ignore_index=ignore_index)) + self.weights.append(tmse_weight) + + if gstmse: + self.criterions.append( + GaussianSimilarityTMSE(threshold=threshold, + ignore_index=ignore_index)) + self.weights.append(gstmse_weight) + + if len(self.criterions) == 0: + print("You have to choose at least one loss function.") + sys.exit(1) + + def get_class_weight(self): + """ + Class weight for CrossEntropy + Class weight is calculated in the way described in: + D. Eigen and R. Fergus, “Predicting depth, surface normals and semantic labels with a common multi-scale convolutional architecture,” in ICCV, + openaccess: https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/Eigen_Predicting_Depth_Surface_ICCV_2015_paper.pdf + """ + # load file list + file_ptr = open(self.file_path, 'r') + info = file_ptr.read().split('\n')[:-1] + file_ptr.close() + + nums = [0 for i in range(self.num_classes)] + for i in range(len(info)): + video_name = info[i] + file_name = video_name.split('.')[0] + ".npy" + label_file_path = os.path.join(self.label_path, file_name) + label = np.load(label_file_path).astype(np.int64) + num, cnt = np.unique(label, return_counts=True) + for n, c in zip(num, cnt): + nums[n] += c + + class_num = paddle.to_tensor(nums, dtype="float32") + total = class_num.sum().item() + frequency = class_num / total + median = paddle.median(frequency) + class_weight = median / frequency + return class_weight + + def forward(self, preds, gts, sim_index): + """ + Args: + preds: paddle.float (N, C, T). + gts: paddle.int64 (N, T). + sim_index: paddle.float (N, C', T). 
+ """ + loss = 0.0 + for criterion, weight in zip(self.criterions, self.weights): + if isinstance(criterion, GaussianSimilarityTMSE): + loss += weight * criterion(preds, gts, sim_index) + elif isinstance(criterion, nn.CrossEntropyLoss): + preds_t = paddle.transpose(preds, perm=[0, 2, 1]) + loss += weight * criterion(preds_t, gts) + else: + loss += weight * criterion(preds, gts) + + return loss + + +class BoundaryRegressionLoss(nn.Layer): + """ + Boundary Regression Loss + bce: Binary Cross Entropy Loss for Boundary Prediction + mse: Mean Squared Error + """ + + def __init__(self, + file_path, + label_path, + bce=True, + focal=False, + mse=False, + weight=None, + pos_weight=None): + super().__init__() + + self.criterions = [] + self.file_path = file_path + self.label_path = label_path + + pos_weight = self.get_pos_weight() + + if bce: + self.criterions.append( + nn.BCEWithLogitsLoss(weight=weight, pos_weight=pos_weight)) + + if focal: + self.criterions.append(FocalLoss()) + + if mse: + self.criterions.append(nn.MSELoss()) + + if len(self.criterions) == 0: + print("You have to choose at least one loss function.") + sys.exit(1) + + def get_pos_weight(self, norm=None): + """ + pos_weight for binary cross entropy with logits loss + pos_weight is defined as reciprocal of ratio of positive samples in the dataset + """ + # load file list + file_ptr = open(self.file_path, 'r') + info = file_ptr.read().split('\n')[:-1] + file_ptr.close() + + n_classes = 2 # boundary or not + nums = [0 for i in range(n_classes)] + for i in range(len(info)): + video_name = info[i] + file_name = video_name.split('.')[0] + ".npy" + label_file_path = os.path.join(self.label_path, file_name) + label = np.load(label_file_path).astype(np.int64) + num, cnt = np.unique(label, return_counts=True) + for n, c in zip(num, cnt): + nums[n] += c + + pos_ratio = nums[1] / sum(nums) + pos_weight = 1 / pos_ratio + + if norm is not None: + pos_weight /= norm + + return paddle.to_tensor(pos_weight, dtype="float32") + + def forward(self, preds, gts): + """ + Args: + preds: paddle.float (N, 1, T). + gts: paddle.float (N, 1, T). 
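+            Returns:
+                the summed criterion values, averaged over the batch (a scalar tensor).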
+ """ + loss = 0.0 + batch_size = float(preds.shape[0]) + + for criterion in self.criterions: + for pred, gt in zip(preds, gts): + loss += criterion(pred, gt) + + return loss / batch_size + + +@LOSSES.register() +class ASRFLoss(nn.Layer): + + def __init__(self, + lambda_bound_loss, + num_classes, + file_path, + label_path, + boundary_path, + ce=True, + asl_focal=True, + tmse=False, + gstmse=False, + asl_weight=None, + threshold=4., + ignore_index=255, + ce_weight=1.0, + focal_weight=1.0, + tmse_weight=0.15, + gstmse_weight=0.15, + bce=True, + brl_focal=False, + mse=False, + brl_weight=None): + super().__init__() + self.criterion_cls = ActionSegmentationLoss(ce=ce, + focal=asl_focal, + tmse=tmse, + gstmse=gstmse, + weight=asl_weight, + threshold=threshold, + ignore_index=ignore_index, + ce_weight=ce_weight, + focal_weight=focal_weight, + tmse_weight=tmse_weight, + gstmse_weight=gstmse_weight, + file_path=file_path, + label_path=label_path, + num_classes=num_classes) + self.criterion_boundary = BoundaryRegressionLoss( + bce=bce, + focal=brl_focal, + mse=mse, + weight=brl_weight, + file_path=file_path, + label_path=boundary_path) + self.lambda_bound_loss = lambda_bound_loss + + def forward(self, x, output_cls, label, outputs_boundary, boundary): + loss = 0.0 + if isinstance(output_cls, list): + n = len(output_cls) + for out in output_cls: + loss += self.criterion_cls(out, label, x) / n + else: + loss += self.criterion_cls(output_cls, label, x) + + if isinstance(outputs_boundary, list): + n = len(outputs_boundary) + for out in outputs_boundary: + loss += self.lambda_bound_loss * self.criterion_boundary( + out, boundary) / n + else: + loss += self.lambda_bound_loss * self.criterion_boundary( + outputs_boundary, boundary) + + return loss diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/base.py b/Bank_second_part/detect_process/paddlevideo/modeling/losses/base.py new file mode 100644 index 0000000..7284252 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/losses/base.py @@ -0,0 +1,49 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import abstractmethod +import paddle +import paddle.nn as nn + +#XXX use _forward?? or forward?? +class BaseWeightedLoss(nn.Layer): + """Base class for loss. + + All subclass should overwrite the ``_forward()`` method which returns the + normal loss without loss weights. + + Args: + loss_weight (float): Factor scalar multiplied on the loss. + Default: 1.0. + """ + + def __init__(self, loss_weight=1.0): + super().__init__() + self.loss_weight = loss_weight + + @abstractmethod + def _forward(self, *args, **kwargs): + pass + + def forward(self, *args, **kwargs): + """Defines the computation performed at every call. + Args: + *args: The positional arguments for the corresponding + loss. + **kwargs: The keyword arguments for the corresponding + loss. + Returns: + paddle.Tensor: The calculated loss. 
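+                This is the value returned by ``_forward`` scaled by ``self.loss_weight``.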
+ """ + return self._forward(*args, **kwargs) * self.loss_weight diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/bmn_loss.py b/Bank_second_part/detect_process/paddlevideo/modeling/losses/bmn_loss.py new file mode 100644 index 0000000..e434850 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/losses/bmn_loss.py @@ -0,0 +1,155 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn.functional as F + +from ..registry import LOSSES +from .base import BaseWeightedLoss + + +@LOSSES.register() +class BMNLoss(BaseWeightedLoss): + """Loss for BMN model + Args: + tscale (int): sequence length, default 100. + dscale (int): max duration length, default 100. + """ + def __init__(self, dscale, tscale): + super().__init__() + self.dscale = dscale + self.tscale = tscale + + def _get_mask(self, dscale, tscale): + bm_mask = [] + for idx in range(dscale): + mask_vector = [1 for i in range(tscale - idx) + ] + [0 for i in range(idx)] + bm_mask.append(mask_vector) + bm_mask = np.array(bm_mask, dtype='float32') + bm_mask = paddle.to_tensor(bm_mask) + bm_mask.stop_gradient = True + return bm_mask + + def tem_loss_func(self, pred_start, pred_end, gt_start, gt_end): + def bi_loss(pred_score, gt_label, datatype): + pred_score = paddle.reshape(x=pred_score, shape=[-1]) + gt_label = paddle.reshape(x=gt_label, shape=[-1]) + gt_label.stop_gradient = True + pmask = paddle.cast(x=(gt_label > 0.5), dtype=datatype) + num_entries = paddle.cast(paddle.shape(pmask), dtype=datatype) + num_positive = paddle.cast(paddle.sum(pmask), dtype=datatype) + ratio = num_entries / num_positive + coef_0 = 0.5 * ratio / (ratio - 1) + coef_1 = 0.5 * ratio + epsilon = 0.000001 + loss_pos = paddle.multiply(paddle.log(pred_score + epsilon), pmask) + loss_pos = coef_1 * paddle.mean(loss_pos) + loss_neg = paddle.multiply(paddle.log(1.0 - pred_score + epsilon), + (1.0 - pmask)) + loss_neg = coef_0 * paddle.mean(loss_neg) + loss = -1 * (loss_pos + loss_neg) + return loss + + loss_start = bi_loss(pred_start, gt_start, pred_start.dtype) + loss_end = bi_loss(pred_end, gt_end, pred_start.dtype) + loss = loss_start + loss_end + return loss + + def pem_reg_loss_func(self, pred_score, gt_iou_map, mask): + gt_iou_map = paddle.multiply(gt_iou_map, mask) + + u_hmask = paddle.cast(x=gt_iou_map > 0.7, dtype=pred_score.dtype) + u_mmask = paddle.logical_and(gt_iou_map <= 0.7, gt_iou_map > 0.3) + u_mmask = paddle.cast(x=u_mmask, dtype=pred_score.dtype) + u_lmask = paddle.logical_and(gt_iou_map <= 0.3, gt_iou_map >= 0.) 
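+        # cast the low-IoU mask to float and keep only valid positions; the medium and
+        # low IoU buckets are then randomly subsampled below so that their counts
+        # roughly match the number of high-IoU (> 0.7) proposals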
+ u_lmask = paddle.cast(x=u_lmask, dtype=pred_score.dtype) + u_lmask = paddle.multiply(u_lmask, mask) + + num_h = paddle.cast(paddle.sum(u_hmask), dtype=pred_score.dtype) + num_m = paddle.cast(paddle.sum(u_mmask), dtype=pred_score.dtype) + num_l = paddle.cast(paddle.sum(u_lmask), dtype=pred_score.dtype) + + r_m = num_h / num_m + u_smmask = paddle.uniform(shape=[ + gt_iou_map.shape[1], gt_iou_map.shape[2] + ], + min=0.0, + max=1.0).astype(pred_score.dtype) + u_smmask = paddle.multiply(u_mmask, u_smmask) + u_smmask = paddle.cast(x=(u_smmask > (1. - r_m)), + dtype=pred_score.dtype) + + r_l = num_h / num_l + u_slmask = paddle.uniform(shape=[ + gt_iou_map.shape[1], gt_iou_map.shape[2] + ], + min=0.0, + max=1.0).astype(pred_score.dtype) + u_slmask = paddle.multiply(u_lmask, u_slmask) + u_slmask = paddle.cast(x=(u_slmask > (1. - r_l)), + dtype=pred_score.dtype) + + weights = u_hmask + u_smmask + u_slmask + weights.stop_gradient = True + loss = F.square_error_cost(pred_score, gt_iou_map) + loss = paddle.multiply(loss, weights) + loss = 0.5 * paddle.sum(loss) / paddle.sum(weights) + + return loss + + def pem_cls_loss_func(self, pred_score, gt_iou_map, mask): + gt_iou_map = paddle.multiply(gt_iou_map, mask) + gt_iou_map.stop_gradient = True + pmask = paddle.cast(x=(gt_iou_map > 0.9), dtype=pred_score.dtype) + nmask = paddle.cast(x=(gt_iou_map <= 0.9), dtype=pred_score.dtype) + nmask = paddle.multiply(nmask, mask) + + num_positive = paddle.sum(pmask) + num_entries = num_positive + paddle.sum(nmask) + ratio = num_entries / num_positive + coef_0 = 0.5 * ratio / (ratio - 1) + coef_1 = 0.5 * ratio + epsilon = 0.000001 + loss_pos = paddle.multiply(paddle.log(pred_score + epsilon), pmask) + loss_pos = coef_1 * paddle.sum(loss_pos) + loss_neg = paddle.multiply(paddle.log(1.0 - pred_score + epsilon), + nmask) + loss_neg = coef_0 * paddle.sum(loss_neg) + loss = -1 * (loss_pos + loss_neg) / num_entries + return loss + + def forward(self, pred_bm, pred_start, pred_end, gt_iou_map, gt_start, + gt_end): + pred_bm_reg = paddle.squeeze(paddle.slice(pred_bm, + axes=[1], + starts=[0], + ends=[1]), + axis=[1]) + pred_bm_cls = paddle.squeeze(paddle.slice(pred_bm, + axes=[1], + starts=[1], + ends=[2]), + axis=[1]) + + bm_mask = self._get_mask(self.dscale, self.tscale) + + pem_reg_loss = self.pem_reg_loss_func(pred_bm_reg, gt_iou_map, bm_mask) + pem_cls_loss = self.pem_cls_loss_func(pred_bm_cls, gt_iou_map, bm_mask) + + tem_loss = self.tem_loss_func(pred_start, pred_end, gt_start, gt_end) + + loss = tem_loss + 10 * pem_reg_loss + pem_cls_loss + return loss diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/cross_entropy_loss.py b/Bank_second_part/detect_process/paddlevideo/modeling/losses/cross_entropy_loss.py new file mode 100644 index 0000000..953f77c --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/losses/cross_entropy_loss.py @@ -0,0 +1,36 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn.functional as F + +from ..registry import LOSSES +from .base import BaseWeightedLoss + + +@LOSSES.register() +class CrossEntropyLoss(BaseWeightedLoss): + """Cross Entropy Loss.""" + def _forward(self, score, labels, **kwargs): + """Forward function. + Args: + score (paddle.Tensor): The class score. + labels (paddle.Tensor): The ground truth labels. + kwargs: Any keyword argument to be used to calculate + CrossEntropy loss. + Returns: + loss (paddle.Tensor): The returned CrossEntropy loss. + """ + loss = F.cross_entropy(score, labels, **kwargs) + return loss diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/depth_loss.py b/Bank_second_part/detect_process/paddlevideo/modeling/losses/depth_loss.py new file mode 100644 index 0000000..ba9a2cb --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/losses/depth_loss.py @@ -0,0 +1,290 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn + +from ..registry import LOSSES +from .base import BaseWeightedLoss + + +def get_smooth_loss(disp, img): + """Computes the smoothness loss for a disparity image + The color image is used for edge-aware smoothness + """ + grad_disp_x = paddle.abs(disp[:, :, :, :-1] - disp[:, :, :, 1:]) + grad_disp_y = paddle.abs(disp[:, :, :-1, :] - disp[:, :, 1:, :]) + + grad_img_x = paddle.mean(paddle.abs(img[:, :, :, :-1] - img[:, :, :, 1:]), + 1, + keepdim=True) + grad_img_y = paddle.mean(paddle.abs(img[:, :, :-1, :] - img[:, :, 1:, :]), + 1, + keepdim=True) + + grad_disp_x *= paddle.exp(-grad_img_x) + grad_disp_y *= paddle.exp(-grad_img_y) + + return grad_disp_x.mean() + grad_disp_y.mean() + + +class DiffLoss(nn.Layer): + def __init__(self): + super(DiffLoss, self).__init__() + + def forward(self, input1, input2): + batch_size = input1.shape[0] + input1 = input1.reshape([batch_size, -1]) + input2 = input2.reshape([batch_size, -1]) + + input1_l2 = input1 + input2_l2 = input2 + + diff_loss = 0 + dim = input1.shape[1] + for i in range(input1.shape[0]): + diff_loss = diff_loss + paddle.mean( + ((input1_l2[i:i + 1, :].mm(input2_l2[i:i + 1, :].T)).pow(2)) / + dim) + + diff_loss = diff_loss / input1.shape[0] + + return diff_loss + + +class MSE(nn.Layer): + def __init__(self): + super(MSE, self).__init__() + + def forward(self, pred, real): + diffs = paddle.add(real, -pred) + n = paddle.numel(diffs) + mse = paddle.sum(diffs.pow(2)) / n + + return mse + + +class SIMSE(nn.Layer): + def __init__(self): + super(SIMSE, self).__init__() + + def forward(self, pred, real): + diffs = paddle.add(real, -pred) + n = paddle.numel(diffs) + simse = paddle.sum(diffs).pow(2) / (n**2) + + return simse + + +class SSIM(nn.Layer): + """Layer to compute the SSIM loss between a pair of images + """ + def __init__(self): + super(SSIM, self).__init__() + self.mu_x_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.mu_y_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.sig_x_pool = 
nn.AvgPool2D(3, 1, exclusive=False) + self.sig_y_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.sig_xy_pool = nn.AvgPool2D(3, 1, exclusive=False) + + self.refl = nn.Pad2D(1, mode='reflect') + + self.C1 = 0.01**2 + self.C2 = 0.03**2 + + def forward(self, x, y): + x = self.refl(x) + y = self.refl(y) + + mu_x = self.mu_x_pool(x) + mu_y = self.mu_y_pool(y) + + sigma_x = self.sig_x_pool(x**2) - mu_x**2 + sigma_y = self.sig_y_pool(y**2) - mu_y**2 + sigma_xy = self.sig_xy_pool(x * y) - mu_x * mu_y + + SSIM_n = (2 * mu_x * mu_y + self.C1) * (2 * sigma_xy + self.C2) + SSIM_d = (mu_x**2 + mu_y**2 + self.C1) * (sigma_x + sigma_y + self.C2) + + return paddle.clip((1 - SSIM_n / SSIM_d) / 2, 0, 1) + + +@LOSSES.register() +class ADDSLoss(BaseWeightedLoss): + def __init__(self, avg_reprojection, disparity_smoothness, no_ssim): + super(ADDSLoss, self).__init__() + self.avg_reprojection = avg_reprojection + self.disparity_smoothness = disparity_smoothness + self.no_ssim = no_ssim + + self.loss_diff = DiffLoss() + self.loss_recon1 = MSE() + self.loss_recon2 = SIMSE() + self.loss_similarity = MSE() + + def compute_reprojection_loss(self, pred, target): + """Computes reprojection loss between a batch of predicted and target images + """ + abs_diff = paddle.abs(target - pred) + l1_loss = abs_diff.mean(1, True) + + if not self.no_ssim: + self.ssim = SSIM() + + if self.no_ssim: + reprojection_loss = l1_loss + else: + ssim_loss = self.ssim(pred, target).mean(1, True) + reprojection_loss = 0.85 * ssim_loss + 0.15 * l1_loss + + return reprojection_loss + + def compute_losses(self, inputs, outputs, is_night): + """Compute the reprojection and smoothness losses for a minibatch + """ + losses = {} + total_loss = 0 + + for scale in outputs['scales']: + loss = 0 + reprojection_losses = [] + + source_scale = 0 + + disp = outputs[("disp", scale)] + if is_night: + color = inputs[("color_n", 0, scale)] + target = inputs[("color_n", 0, source_scale)] + else: + color = inputs[("color", 0, scale)] + target = inputs[("color", 0, source_scale)] + + for frame_id in outputs['frame_ids'][1:]: + pred = outputs[("color", frame_id, scale)] + reprojection_losses.append( + self.compute_reprojection_loss(pred, target)) + + reprojection_losses = paddle.concat(reprojection_losses, 1) + + identity_reprojection_losses = [] + for frame_id in outputs['frame_ids'][1:]: + if is_night: + pred = inputs[("color_n", frame_id, source_scale)] + else: + pred = inputs[("color", frame_id, source_scale)] + identity_reprojection_losses.append( + self.compute_reprojection_loss(pred, target)) + + identity_reprojection_losses = paddle.concat( + identity_reprojection_losses, 1) + + if self.avg_reprojection: + identity_reprojection_loss = identity_reprojection_losses.mean( + 1, keepdim=True) + else: + # save both images, and do min all at once below + identity_reprojection_loss = identity_reprojection_losses + + if self.avg_reprojection: + reprojection_loss = reprojection_losses.mean(1, keepdim=True) + else: + reprojection_loss = reprojection_losses + + # add random numbers to break ties + identity_reprojection_loss = identity_reprojection_loss + paddle.randn( + identity_reprojection_loss.shape) * 0.00001 + + combined = paddle.concat( + (identity_reprojection_loss, reprojection_loss), axis=1) + if combined.shape[1] == 1: + to_optimise = combined + else: + to_optimise = paddle.min(combined, axis=1) + + loss = loss + to_optimise.mean() + + mean_disp = disp.mean(2, True).mean(3, True) + norm_disp = disp / (mean_disp + 1e-7) + smooth_loss = 
get_smooth_loss(norm_disp, color) + + loss = loss + self.disparity_smoothness * smooth_loss / (2**scale) + total_loss = total_loss + loss + losses["loss/{}".format(scale)] = loss + + total_loss /= len(outputs['scales']) + losses["loss"] = total_loss + return losses + + def forward(self, inputs, outputs): + + losses_day = self.compute_losses(inputs, outputs, 'day') + losses_night = self.compute_losses(inputs, outputs['outputs_night'], + 'night') + + loss = 0 + losses = [] + # diff + target_diff1 = 0.5 * self.loss_diff( + outputs['result'][0], outputs['result'][2]) # 10 when batchsize=1 + target_diff2 = 0.5 * self.loss_diff(outputs['result_night'][0], + outputs['result_night'][2]) + losses.append(target_diff1) + losses.append(target_diff2) + loss = loss + target_diff1 + loss = loss + target_diff2 + + target_diff3 = 1 * self.loss_diff( + outputs['result'][1], outputs['result'][3]) # 10 when batchsize=1 + target_diff4 = 1 * self.loss_diff(outputs['result_night'][1], + outputs['result_night'][3]) + losses.append(target_diff3) + losses.append(target_diff4) + loss = loss + target_diff3 + loss = loss + target_diff4 + + # recon + target_mse = 1 * self.loss_recon1(outputs['result'][5], + inputs["color_aug", 0, 0]) + loss = loss + target_mse + + target_simse = 1 * self.loss_recon2(outputs['result'][5], + inputs["color_aug", 0, 0]) + loss = loss + target_simse + + losses.append(target_mse) + losses.append(target_simse) + target_mse_night = 1 * self.loss_recon1(outputs['result_night'][5], + inputs["color_n_aug", 0, 0]) + loss = loss + target_mse_night + + target_simse_night = 1 * self.loss_recon2(outputs['result_night'][5], + inputs["color_n_aug", 0, 0]) + loss = loss + target_simse_night + + losses.append(target_mse_night) + losses.append(target_simse_night) + + # depth loss + pseudo_label = outputs[("disp", 0)].detach() + depth_loss = 1 * self.loss_similarity( + outputs['outputs_night'][("disp", 0)], pseudo_label) + loss = loss + depth_loss + + losses.append(depth_loss) + + outputs['loss'] = loss + losses_day['loss'] + losses_night['loss'] + outputs['losses_day'] = losses_day['loss'] + outputs['losses_night'] = losses_night['loss'] + + return outputs diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/distillation_loss.py b/Bank_second_part/detect_process/paddlevideo/modeling/losses/distillation_loss.py new file mode 100644 index 0000000..d27f941 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/losses/distillation_loss.py @@ -0,0 +1,79 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn.functional as F +import paddle.nn as nn + +from ..registry import LOSSES +from .base import BaseWeightedLoss + + +@LOSSES.register() +class DistillationCELoss(BaseWeightedLoss): + """Distillation Entropy Loss.""" + def _forward(self, score, labels, **kwargs): + """Forward function. + Args: + score (paddle.Tensor): The class score. + labels (paddle.Tensor): The ground truth labels. 
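+                Either a single-element list ``[label]`` for plain training, or
+                ``[label_a, label_b, lam]`` when VideoMix is used.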
+ kwargs: Any keyword argument to be used to calculate + CrossEntropy loss. + Returns: + loss (paddle.Tensor): The returned CrossEntropy loss. + """ + if len(labels) == 1: + label = labels[0] + loss = F.cross_entropy(score, label, **kwargs) + # Deal with VideoMix + elif len(labels) == 3: + label_a, label_b, lam = labels + loss_a = F.cross_entropy(score, label_a, **kwargs) + loss_b = F.cross_entropy(score, label_b, **kwargs) + loss = lam * loss_a + (1 - lam) * loss_b + loss = paddle.mean(loss) #lam shape is bs + return loss + + +@LOSSES.register() +class DistillationDMLLoss(BaseWeightedLoss): + """ + DistillationDMLLoss + """ + def __init__(self, act="softmax", eps=1e-12, **kargs): + super().__init__(**kargs) + if act is not None: + assert act in ["softmax", "sigmoid"] + if act == "softmax": + self.act = nn.Softmax(axis=-1) + elif act == "sigmoid": + self.act = nn.Sigmoid() + else: + self.act = None + self.eps = eps + + def _kldiv(self, x, target): + class_num = x.shape[-1] + cost = target * paddle.log( + (target + self.eps) / (x + self.eps)) * class_num + return cost + + def _forward(self, x, target): + if self.act is not None: + x = self.act(x) + target = self.act(target) + loss = self._kldiv(x, target) + self._kldiv(target, x) + loss = loss / 2 + loss = paddle.mean(loss) + return loss diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/transnetv2_loss.py b/Bank_second_part/detect_process/paddlevideo/modeling/losses/transnetv2_loss.py new file mode 100644 index 0000000..624c468 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/losses/transnetv2_loss.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn.functional as F +from ..registry import LOSSES +from .base import BaseWeightedLoss + + +@LOSSES.register() +class TransNetV2Loss(BaseWeightedLoss): + """Loss for TransNetV2 model + """ + def __init__(self, transition_weight=5.0, many_hot_loss_weight=0.1): + self.transition_weight = transition_weight + self.many_hot_loss_weight = many_hot_loss_weight + super().__init__() + + def _forward(self, one_hot_pred, one_hot_gt, + many_hot_pred=None, many_hot_gt=None, reg_losses=None): + assert transition_weight != 1 + + one_hot_pred = one_hot_pred[:, :, 0] + + one_hot_gt = one_hot_gt.astype('float32') + one_hot_loss = F.binary_cross_entropy_with_logits(logit=one_hot_pred, label=one_hot_gt, reduction='none') + + one_hot_loss *= 1 + one_hot_gt * (transition_weight - 1) + + one_hot_loss = paddle.mean(one_hot_loss) + + many_hot_loss = 0. + if many_hot_loss_weight != 0. 
and many_hot_pred is not None: + many_hot_loss = many_hot_loss_weight * paddle.mean( + F.binary_cross_entropy_with_logits(logit=many_hot_pred[:, :, 0], + label=many_hot_gt.astype('float32'), reduction='none')) + + total_loss = one_hot_loss + many_hot_loss + + if reg_losses is not None: + for name, value in reg_losses.items(): + if value is not None: + total_loss += value + + return total_loss \ No newline at end of file diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/yowo_loss.py b/Bank_second_part/detect_process/paddlevideo/modeling/losses/yowo_loss.py new file mode 100644 index 0000000..5ca3290 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/losses/yowo_loss.py @@ -0,0 +1,251 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy +import paddle +import paddle.nn.functional as F +import paddle.nn as nn +from paddle.static import Variable + +from ..registry import LOSSES +from .base import BaseWeightedLoss +from ..framework.localizers.yowo_utils import build_targets + + +class FocalLoss(nn.Layer): + """ + This criterion is a implemenation of Focal Loss, which is proposed in + Focal Loss for Dense Object Detection. + + Loss(x, class) = - \alpha (1-softmax(x)[class])^gamma \log(softmax(x)[class]) + + The losses are averaged across observations for each minibatch. + + Args: + alpha(1D Tensor, Variable) : the scalar factor for this criterion + gamma(float, double) : gamma > 0; reduces the relative loss for well-classified examples (p > .5), + putting more focus on hard, misclassified examples + size_average(bool): size_average(bool): By default, the losses are averaged over observations for each minibatch. + However, if the field size_average is set to False, the losses are + instead summed for each minibatch. 
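+        Example (illustrative shapes and values only):
+            criterion = FocalLoss(class_num=num_classes, gamma=2, size_average=False)
+            # cls_scores: (N, num_classes), cls_targets: (N,)
+            loss = criterion(cls_scores, cls_targets)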
+ + """ + + def __init__(self, class_num, alpha=None, gamma=2, size_average=True): + super(FocalLoss, self).__init__() + + if alpha is None: + self.alpha = paddle.ones( + [class_num, 1]) + self.alpha.stop_gradient = False + else: + if isinstance(alpha, Variable): + self.alpha = alpha + else: + self.alpha = (alpha) + self.alpha.stop_gradient = False + self.gamma = gamma + self.class_num = class_num + self.size_average = size_average + + def forward(self, inputs, targets): + N = inputs.shape[0] + C = inputs.shape[1] + P = F.softmax(inputs, axis=1) + + tmp = numpy.zeros((N, C)) + class_mask = paddle.to_tensor(tmp, place=inputs.place) + class_mask.stop_gradient = False + ids = paddle.reshape(targets, [-1, 1]) + class_mask = F.one_hot(ids.squeeze(-1), class_mask.shape[1]) + + if "Place" not in str(inputs.place) and "Place" not in str(self.alpha.place): + self.alpha = self.alpha.cuda() + + alpha = self.alpha[paddle.reshape(ids.detach(), [-1])] + + probs = paddle.reshape((P * class_mask).sum(1), [-1, 1]) + + log_p = probs.log() + + batch_loss = -alpha * (paddle.pow((1 - probs), self.gamma)) * log_p + + if self.size_average: + loss = batch_loss.mean() + else: + loss = batch_loss.sum() + return loss + + +@LOSSES.register() +class RegionLoss(BaseWeightedLoss): + # for our model anchors has 10 values and number of anchors is 5 + # parameters: 24, 10 float values, 24, 5 + def __init__(self, num_classes, anchors, num_anchors, object_scale, noobject_scale, class_scale, coord_scale): + super().__init__() + self.num_classes = num_classes + self.anchors = [float(x) for x in anchors] + self.num_anchors = num_anchors + self.anchor_step = len(self.anchors) // self.num_anchors # each anchor has 2 parameters + self.object_scale = object_scale + self.noobject_scale = noobject_scale + self.class_scale = class_scale + self.coord_scale = coord_scale + self.focalloss = FocalLoss(class_num=self.num_classes, gamma=2, size_average=False) + self.thresh = 0.6 + + def convert2cpu(self, gpu_matrix): + # return paddle.to_tensor((gpu_matrix.shape), dtype="float32").copy_(gpu_matrix) + return gpu_matrix.cpu() + + def forward(self, output, target): + # output : B*A*(4+1+num_classes)*H*W 8*5*29*24*24 + # B: number of batches + # A: number of anchors + # 4: 4 parameters for each bounding box + # 1: confidence score + # num_classes + # H: height of the image (in grids) + # W: width of the image (in grids) + # for each grid cell, there are A*(4+1+num_classes) parameters + nB = output.detach().shape[0] # batch + nA = self.num_anchors # anchor_num + nC = self.num_classes + nH = output.detach().shape[2] + nW = output.detach().shape[3] + + # resize the output (all parameters for each anchor can be reached) + output = paddle.reshape(output, [nB, nA, (5 + nC), nH, nW]) + # anchor's parameter tx + + x = F.sigmoid( + paddle.reshape(paddle.index_select(output, paddle.to_tensor([0], dtype='int64').cuda(), axis=2), + [nB, nA, nH, nW])) + x.stop_gradient = False + # anchor's parameter ty + y = F.sigmoid( + paddle.reshape(paddle.index_select(output, paddle.to_tensor([1], dtype='int64').cuda(), axis=2), + [nB, nA, nH, nW])) + y.stop_gradient = False + # anchor's parameter tw + w = paddle.reshape(paddle.index_select(output, paddle.to_tensor([2], dtype='int64').cuda(), axis=2), + [nB, nA, nH, nW]) + w.stop_gradient = False + # anchor's parameter th + h = paddle.reshape(paddle.index_select(output, paddle.to_tensor([3], dtype='int64').cuda(), axis=2), + [nB, nA, nH, nW]) + h.stop_gradient = False + # confidence score for each anchor + conf = 
F.sigmoid( + paddle.reshape(paddle.index_select(output, paddle.to_tensor([4], dtype='int64').cuda(), axis=2), + [nB, nA, nH, nW])) + conf.stop_gradient = False + # anchor's parameter class label + cls = paddle.index_select(output, paddle.linspace(5, 5 + nC - 1, nC, 'int64').cuda(), axis=2) + cls.stop_gradient = False + # resize the data structure so that for every anchor there is a class label in the last dimension + cls = paddle.reshape(paddle.transpose(paddle.reshape(cls, [nB * nA, nC, nH * nW]), [0, 2, 1]), + [nB * nA * nH * nW, nC]) + + # for the prediction of localization of each bounding box, there exist 4 parameters (tx, ty, tw, th) + # pred_boxes = torch.cuda.FloatTensor(4, nB*nA*nH*nW) + pred_boxes = paddle.zeros([4, nB * nA * nH * nW], dtype='float32').cuda() + # tx and ty + grid_x = paddle.reshape(paddle.tile(paddle.tile(paddle.linspace(0, nW - 1, nW), [nH, 1]), [nB * nA, 1, 1]), + [nB * nA * nH * nW]).cuda() + grid_y = paddle.reshape(paddle.tile(paddle.tile(paddle.linspace(0, nH - 1, nH), [nW, 1]).t(), [nB * nA, 1, 1]), + [nB * nA * nH * nW]).cuda() + # for each anchor there are anchor_step variables (with the structure num_anchor*anchor_step) + # for each row(anchor), the first variable is anchor's width, second is anchor's height + # pw and ph + anchor_w = paddle.index_select(paddle.reshape(paddle.to_tensor(self.anchors), [nA, self.anchor_step]), + paddle.to_tensor([0], dtype='int64'), axis=1).cuda() + anchor_h = paddle.index_select(paddle.reshape(paddle.to_tensor(self.anchors), [nA, self.anchor_step]), + paddle.to_tensor([1], dtype='int64'), axis=1).cuda() + # for each pixel (grid) repeat the above process (obtain width and height of each grid) + anchor_w = paddle.reshape(paddle.tile(paddle.tile(anchor_w, [nB, 1]), [1, 1, nH * nW]), [nB * nA * nH * nW]) + anchor_h = paddle.reshape(paddle.tile(paddle.tile(anchor_h, [nB, 1]), [1, 1, nH * nW]), [nB * nA * nH * nW]) + # prediction of bounding box localization + # x.data and y.data: top left corner of the anchor + # grid_x, grid_y: tx and ty predictions made by yowo + + x_data = paddle.reshape(x.detach(), [-1]) + y_data = paddle.reshape(y.detach(), [-1]) + w_data = paddle.reshape(w.detach(), [-1]) + h_data = paddle.reshape(h.detach(), [-1]) + + pred_boxes[0] = paddle.cast(x_data, dtype='float32') + paddle.cast(grid_x, dtype='float32') # bx + pred_boxes[1] = paddle.cast(y_data, dtype='float32') + paddle.cast(grid_y, dtype='float32') # by + pred_boxes[2] = paddle.exp(paddle.cast(w_data, dtype='float32')) * paddle.cast(anchor_w, dtype='float32') # bw + pred_boxes[3] = paddle.exp(paddle.cast(h_data, dtype='float32')) * paddle.cast(anchor_h, dtype='float32') # bh + # the size -1 is inferred from other dimensions + # pred_boxes (nB*nA*nH*nW, 4) + + pred_boxes = self.convert2cpu( + paddle.cast(paddle.reshape(paddle.transpose(pred_boxes, (1, 0)), [-1, 4]), dtype='float32')) + + nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes, + target.detach(), + self.anchors, nA, + nC, \ + nH, nW, + self.noobject_scale, + self.object_scale, + self.thresh) + cls_mask = (cls_mask == 1) + # keep those with high box confidence scores (greater than 0.25) as our final predictions + nProposals = int((conf > 0.25).sum().detach().item()) + + tx = (tx).cuda() + tx.stop_gradient = False + ty = ty.cuda() + ty.stop_gradient = False + tw = tw.cuda() + tw.stop_gradient = False + th = th.cuda() + th.stop_gradient = False + tconf = tconf.cuda() + tconf.stop_gradient = False + + tcls = paddle.reshape(tcls, 
[-1]).astype('int64')[paddle.reshape(cls_mask, [-1])].cuda() + tcls.stop_gradient = False + + coord_mask = coord_mask.cuda() + coord_mask.stop_gradient = False + conf_mask = conf_mask.cuda().sqrt() + coord_mask.stop_gradient = False + cls_mask = paddle.tile(paddle.reshape(cls_mask, [-1, 1]), [1, nC]).cuda() + cls_mask.stop_gradient = False + + cls = paddle.reshape(cls[cls_mask], [-1, nC]) + + # losses between predictions and targets (ground truth) + # In total 6 aspects are considered as losses: + # 4 for bounding box location, 2 for prediction confidence and classification seperately + L1_loss = nn.SmoothL1Loss(reduction='sum') + loss_x = self.coord_scale * L1_loss(paddle.cast(x, dtype="float32") * coord_mask, tx * coord_mask) / 2.0 + loss_y = self.coord_scale * L1_loss(paddle.cast(y, dtype="float32") * coord_mask, ty * coord_mask) / 2.0 + loss_w = self.coord_scale * L1_loss(paddle.cast(w * coord_mask, dtype="float32"), tw * coord_mask) / 2.0 + loss_h = self.coord_scale * L1_loss(paddle.cast(h * coord_mask, dtype="float32"), th * coord_mask) / 2.0 + loss_conf = nn.MSELoss(reduction='sum')(paddle.cast(conf, dtype="float32") * conf_mask, tconf * conf_mask) / 2.0 + + # try focal loss with gamma = 2 + loss_cls = self.class_scale * self.focalloss(cls, tcls) + + # sum of loss + loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls + + return loss, nCorrect + + diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/registry.py b/Bank_second_part/detect_process/paddlevideo/modeling/registry.py new file mode 100644 index 0000000..b8140e1 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/registry.py @@ -0,0 +1,31 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..utils import Registry + +BACKBONES = Registry('backbone') +HEADS = Registry('head') +RECOGNIZERS = Registry('recognizer') +SEGMENTERS = Registry('Segmenters') +LOCALIZERS = Registry('localizer') +PARTITIONERS = Registry('partitioner') +LOSSES = Registry('loss') +ROI_EXTRACTORS = Registry('roi_extractor') +DETECTORS = Registry('detectors') +BBOX_ASSIGNERS = Registry('bbox_assigner') +BBOX_SAMPLERS = Registry('bbox_sampler') +BBOX_CODERS = Registry('bbox_coder') +ESTIMATORS = Registry('estimator') +MULTIMODAL = Registry('multimodal') +SEGMENT = Registry('segment') diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/samplers/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/samplers/__init__.py new file mode 100644 index 0000000..0cf7f15 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/samplers/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .random_sampler import RandomSampler + +__all__ = ['RandomSampler'] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/samplers/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/samplers/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..0155487 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/samplers/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/samplers/__pycache__/random_sampler.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/samplers/__pycache__/random_sampler.cpython-310.pyc new file mode 100644 index 0000000..5598aaa Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/samplers/__pycache__/random_sampler.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/samplers/random_sampler.py b/Bank_second_part/detect_process/paddlevideo/modeling/samplers/random_sampler.py new file mode 100644 index 0000000..4808454 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/samplers/random_sampler.py @@ -0,0 +1,146 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +import numpy as np +from ..registry import BBOX_SAMPLERS + +class SamplingResult(): + """Bbox sampling result. 
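+    Holds the indices of the sampled positive/negative proposals together with their
+    boxes, the matched ground-truth boxes and (optionally) the ground-truth labels.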
""" + + def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, + gt_flags): + self.pos_inds = pos_inds + self.neg_inds = neg_inds + self.pos_bboxes = paddle.index_select(bboxes,pos_inds) + + # neg_inds may be empty + if neg_inds.shape[0]!=0: + self.neg_bboxes = paddle.index_select(bboxes,neg_inds) + else: + self.neg_bboxes=None + + self.pos_is_gt = paddle.index_select(gt_flags,pos_inds) + self.num_gts = gt_bboxes.shape[0] + self.pos_assigned_gt_inds = paddle.index_select(assign_result.gt_inds,pos_inds) - 1 + + if float(gt_bboxes.numel()) == 0: + assert self.pos_assigned_gt_inds.numel() == 0 + self.pos_gt_bboxes = paddle.empty_like(gt_bboxes).view(-1, 4) + else: + if len(gt_bboxes.shape) < 2: + gt_bboxes = gt_bboxes.view(-1, 4) + + self.pos_gt_bboxes = paddle.index_select(gt_bboxes, self.pos_assigned_gt_inds) + + if assign_result.labels is not None: + self.pos_gt_labels = paddle.index_select(assign_result.labels, pos_inds) + else: + self.pos_gt_labels = None + + @property + def bboxes(self): + if self.neg_bboxes is not None: + ret = paddle.concat([self.pos_bboxes, self.neg_bboxes]) + else: + # neg bbox may be empty + ret = self.pos_bboxes + return ret + + + +@BBOX_SAMPLERS.register() +class RandomSampler(): + def __init__(self, + num, + pos_fraction, + neg_pos_ub=-1, + add_gt_as_proposals=True, + **kwargs): + self.num = num + self.pos_fraction = pos_fraction + self.neg_pos_ub = neg_pos_ub + self.add_gt_as_proposals = add_gt_as_proposals + + def sample(self, + assign_result, + bboxes, + gt_bboxes, + gt_labels=None, + **kwargs): + """Sample positive and negative bboxes. """ + + if len(bboxes.shape) < 2: + bboxes = bboxes[None, :] + + bboxes = bboxes[:, :4] + + gt_flags = paddle.full([bboxes.shape[0], ], 0, dtype='int32') + if self.add_gt_as_proposals and len(gt_bboxes) > 0: + if gt_labels is None: + raise ValueError( + 'gt_labels must be given when add_gt_as_proposals is True') + bboxes = paddle.concat([gt_bboxes, bboxes]) + assign_result.add_gt_(gt_labels) + gt_ones = paddle.full([gt_bboxes.shape[0], ], 1, dtype='int32') + gt_flags = paddle.concat([gt_ones, gt_flags]) + + #1. 得到正样本的数量, inds + num_expected_pos = int(self.num * self.pos_fraction) + pos_inds = self._sample_pos( assign_result, num_expected_pos, bboxes=bboxes, **kwargs) + pos_inds = paddle.to_tensor(np.unique(pos_inds.numpy())) + + #2. 得到负样本的数量, inds + num_sampled_pos = pos_inds.numel() + num_expected_neg = self.num - num_sampled_pos + neg_inds = self._sample_neg( + assign_result, num_expected_neg, bboxes=bboxes, **kwargs) + neg_inds = paddle.to_tensor(np.unique(neg_inds.numpy())) + + #3. 得到sampling result + sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, + assign_result, gt_flags) + return sampling_result + def random_choice(self, gallery, num): + """Random select some elements from the gallery. """ + assert len(gallery) >= num + + perm = paddle.arange(gallery.numel())[:num] + perm = paddle.randperm(gallery.numel())[:num] + rand_inds = paddle.index_select(gallery, perm) + return rand_inds + + def _sample_pos(self, assign_result, num_expected, **kwargs): + """Randomly sample some positive samples.""" + #1.首先看一下给的bboxes里面有哪些label是大于0的 得到了他们的index + pos_inds = paddle.nonzero(assign_result.gt_inds, as_tuple=False) + + #2. 
只要这个pos_inds的数目不是0个 这些就都可以是positive sample + # 当pos_inds的数目小于num_expected(想要的sample的最大数目), 就直接用这个pos_inds + # 反之就从这么多index里随机采样num_expected个出来 + if float(pos_inds.numel()) != 0: + pos_inds = pos_inds.squeeze() + if float(pos_inds.numel()) <= num_expected: + return pos_inds + else: + return self.random_choice(pos_inds, num_expected) + + def _sample_neg(self, assign_result, num_expected, **kwargs): + """Randomly sample some negative samples.""" + neg_inds = paddle.nonzero(assign_result.gt_inds == 0, as_tuple=False) + if float(neg_inds.numel()) != 0: + neg_inds = neg_inds.squeeze() + if (float(neg_inds.numel())) <= float(num_expected): + return neg_inds + else: + return self.random_choice(neg_inds, num_expected) diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/weight_init.py b/Bank_second_part/detect_process/paddlevideo/modeling/weight_init.py new file mode 100644 index 0000000..4722895 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/weight_init.py @@ -0,0 +1,157 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import paddle.nn.initializer as init +import numpy as np +from scipy import special + + +def weight_init_(layer, + func, + weight_name=None, + bias_name=None, + bias_value=0.0, + **kwargs): + """ + In-place params init function. + Usage: + .. code-block:: python + + import paddle + import numpy as np + + data = np.ones([3, 4], dtype='float32') + linear = paddle.nn.Linear(4, 4) + input = paddle.to_tensor(data) + print(linear.weight) + linear(input) + + weight_init_(linear, 'Normal', 'fc_w0', 'fc_b0', std=0.01, mean=0.1) + print(linear.weight) + """ + + if hasattr(layer, 'weight') and layer.weight is not None: + getattr(init, func)(**kwargs)(layer.weight) + if weight_name is not None: + # override weight name + layer.weight.name = weight_name + + if hasattr(layer, 'bias') and layer.bias is not None: + init.Constant(bias_value)(layer.bias) + if bias_name is not None: + # override bias name + layer.bias.name = bias_name + + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + print("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.") + + with paddle.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to [2l-1, 2u-1]. 
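+        # erfinv below converts these uniform samples into truncated standard-normal
+        # samples, which are then rescaled by std, shifted by mean and clipped to [a, b]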
+ tmp = np.random.uniform(2 * l - 1, 2 * u - 1, + size=list(tensor.shape)).astype(np.float32) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tmp = special.erfinv(tmp) + + # Transform to proper mean, std + tmp *= (std * math.sqrt(2.0)) + tmp += mean + + # Clamp to ensure it's in the proper range + tmp = np.clip(tmp, a, b) + tensor.set_value(paddle.to_tensor(tmp)) + + return tensor + + +def _calculate_fan_in_and_fan_out(tensor): + dimensions = tensor.dim() + if dimensions < 2: + raise ValueError( + "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" + ) + + num_input_fmaps = tensor.shape[1] + num_output_fmaps = tensor.shape[0] + receptive_field_size = 1 + if tensor.dim() > 2: + receptive_field_size = tensor[0][0].numel() + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +def kaiming_normal_(tensor, a=0., mode='fan_in', nonlinearity='leaky_relu'): + def _calculate_correct_fan(tensor, mode): + mode = mode.lower() + valid_modes = ['fan_in', 'fan_out'] + if mode not in valid_modes: + raise ValueError( + "Mode {} not supported, please use one of {}".format( + mode, valid_modes)) + + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + return fan_in if mode == 'fan_in' else fan_out + + def calculate_gain(nonlinearity, param=None): + linear_fns = [ + 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', + 'conv_transpose2d', 'conv_transpose3d' + ] + if nonlinearity in linear_fns or nonlinearity == 'sigmoid': + return 1 + elif nonlinearity == 'tanh': + return 5.0 / 3 + elif nonlinearity == 'relu': + return math.sqrt(2.0) + elif nonlinearity == 'leaky_relu': + if param is None: + negative_slope = 0.01 + elif not isinstance(param, bool) and isinstance( + param, int) or isinstance(param, float): + negative_slope = param + else: + raise ValueError( + "negative_slope {} not a valid number".format(param)) + return math.sqrt(2.0 / (1 + negative_slope**2)) + else: + raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) + + fan = _calculate_correct_fan(tensor, mode) + gain = calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + with paddle.no_grad(): + paddle.nn.initializer.Normal(0, std)(tensor) + return tensor diff --git a/Bank_second_part/detect_process/paddlevideo/solver/__init__.py b/Bank_second_part/detect_process/paddlevideo/solver/__init__.py new file mode 100644 index 0000000..01cf9cd --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/solver/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .optimizer import build_optimizer +from .lr import build_lr diff --git a/Bank_second_part/detect_process/paddlevideo/solver/custom_lr.py b/Bank_second_part/detect_process/paddlevideo/solver/custom_lr.py new file mode 100644 index 0000000..bbf8d74 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/solver/custom_lr.py @@ -0,0 +1,338 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from paddle.optimizer.lr import * +import numpy as np +""" +PaddleVideo Learning Rate Schedule: +You can use paddle.optimizer.lr +or define your custom_lr in this file. +""" + + +class CustomWarmupCosineDecay(LRScheduler): + r""" + We combine warmup and stepwise-cosine which is used in slowfast model. + + Args: + warmup_start_lr (float): start learning rate used in warmup stage. + warmup_epochs (int): the number epochs of warmup. + cosine_base_lr (float|int, optional): base learning rate in cosine schedule. + max_epoch (int): total training epochs. + num_iters(int): number iterations of each epoch. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + Returns: + ``CosineAnnealingDecay`` instance to schedule learning rate. + """ + + def __init__(self, + warmup_start_lr, + warmup_epochs, + cosine_base_lr, + max_epoch, + num_iters, + last_epoch=-1, + verbose=False): + self.warmup_start_lr = warmup_start_lr + self.warmup_epochs = warmup_epochs + self.cosine_base_lr = cosine_base_lr + self.max_epoch = max_epoch + self.num_iters = num_iters + #call step() in base class, last_lr/last_epoch/base_lr will be update + super(CustomWarmupCosineDecay, self).__init__(last_epoch=last_epoch, + verbose=verbose) + + def step(self, epoch=None): + """ + ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` . + The new learning rate will take effect on next ``optimizer.step`` . + Args: + epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1. + Returns: + None + """ + if epoch is None: + if self.last_epoch == -1: + self.last_epoch += 1 + else: + self.last_epoch += 1 / self.num_iters # update step with iters + else: + self.last_epoch = epoch + self.last_lr = self.get_lr() + + if self.verbose: + print('Epoch {}: {} set learning rate to {}.'.format( + self.last_epoch, self.__class__.__name__, self.last_lr)) + + def _lr_func_cosine(self, cur_epoch, cosine_base_lr, max_epoch): + return cosine_base_lr * (math.cos(math.pi * cur_epoch / max_epoch) + + 1.0) * 0.5 + + def get_lr(self): + """Define lr policy""" + lr = self._lr_func_cosine(self.last_epoch, self.cosine_base_lr, + self.max_epoch) + lr_end = self._lr_func_cosine(self.warmup_epochs, self.cosine_base_lr, + self.max_epoch) + + # Perform warm up. 
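+        # during warmup, linearly interpolate from warmup_start_lr to the cosine lr
+        # reached at warmup_epochs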
+ if self.last_epoch < self.warmup_epochs: + lr_start = self.warmup_start_lr + alpha = (lr_end - lr_start) / self.warmup_epochs + lr = self.last_epoch * alpha + lr_start + return lr + + +class CustomWarmupPiecewiseDecay(LRScheduler): + r""" + This op combine warmup and stepwise-cosine which is used in slowfast model. + + Args: + warmup_start_lr (float): start learning rate used in warmup stage. + warmup_epochs (int): the number epochs of warmup. + step_base_lr (float|int, optional): base learning rate in step schedule. + max_epoch (int): total training epochs. + num_iters(int): number iterations of each epoch. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + Returns: + ``CustomWarmupPiecewiseDecay`` instance to schedule learning rate. + """ + + def __init__(self, + warmup_start_lr, + warmup_epochs, + step_base_lr, + lrs, + gamma, + steps, + max_epoch, + num_iters, + last_epoch=0, + verbose=False): + self.warmup_start_lr = warmup_start_lr + self.warmup_epochs = warmup_epochs + self.step_base_lr = step_base_lr + self.lrs = lrs + self.gamma = gamma + self.steps = steps + self.max_epoch = max_epoch + self.num_iters = num_iters + self.last_epoch = last_epoch + self.last_lr = self.warmup_start_lr # used in first iter + self.verbose = verbose + self._var_name = None + + def step(self, epoch=None, rebuild=False): + """ + ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` . + The new learning rate will take effect on next ``optimizer.step`` . + Args: + epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1. + Returns: + None + """ + if epoch is None: + if not rebuild: + self.last_epoch += 1 / self.num_iters # update step with iters + else: + self.last_epoch = epoch + self.last_lr = self.get_lr() + + if self.verbose: + print( + 'step Epoch {}: {} set learning rate to {}.self.num_iters={}, 1/self.num_iters={}' + .format(self.last_epoch, self.__class__.__name__, self.last_lr, + self.num_iters, 1 / self.num_iters)) + + def _lr_func_steps_with_relative_lrs(self, cur_epoch, lrs, base_lr, steps, + max_epoch): + # get step index + steps = steps + [max_epoch] + for ind, step in enumerate(steps): + if cur_epoch < step: + break + if self.verbose: + print( + '_lr_func_steps_with_relative_lrs, cur_epoch {}: {}, steps {}, ind {}, step{}, max_epoch{}' + .format(cur_epoch, self.__class__.__name__, steps, ind, step, + max_epoch)) + + return lrs[ind - 1] * base_lr + + def get_lr(self): + """Define lr policy""" + lr = self._lr_func_steps_with_relative_lrs( + self.last_epoch, + self.lrs, + self.step_base_lr, + self.steps, + self.max_epoch, + ) + lr_end = self._lr_func_steps_with_relative_lrs( + self.warmup_epochs, + self.lrs, + self.step_base_lr, + self.steps, + self.max_epoch, + ) + + # Perform warm up. 
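+        # warmup here is the same linear interpolation, but towards the stepwise lr
+        # evaluated at warmup_epochs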
+ if self.last_epoch < self.warmup_epochs: + lr_start = self.warmup_start_lr + alpha = (lr_end - lr_start) / self.warmup_epochs + lr = self.last_epoch * alpha + lr_start + if self.verbose: + print( + 'get_lr, Epoch {}: {}, lr {}, lr_end {}, self.lrs{}, self.step_base_lr{}, self.steps{}, self.max_epoch{}' + .format(self.last_epoch, self.__class__.__name__, lr, lr_end, + self.lrs, self.step_base_lr, self.steps, + self.max_epoch)) + + return lr + + +class CustomPiecewiseDecay(PiecewiseDecay): + + def __init__(self, **kargs): + kargs.pop('num_iters') + super().__init__(**kargs) + + +class CustomWarmupCosineStepDecay(LRScheduler): + + def __init__(self, + warmup_iters, + warmup_ratio=0.1, + min_lr=0, + base_lr=3e-5, + max_epoch=30, + last_epoch=-1, + num_iters=None, + verbose=False): + + self.warmup_ratio = warmup_ratio + self.min_lr = min_lr + self.warmup_epochs = warmup_iters + self.warmup_iters = warmup_iters * num_iters + self.cnt_iters = 0 + self.cnt_epoch = 0 + self.num_iters = num_iters + self.tot_iters = max_epoch * num_iters + self.max_epoch = max_epoch + self.cosine_base_lr = base_lr # initial lr for all param groups + self.regular_lr = self.get_regular_lr() + super().__init__(last_epoch=last_epoch, verbose=verbose) + + def annealing_cos(self, start, end, factor, weight=1): + cos_out = math.cos(math.pi * factor) + 1 + return end + 0.5 * weight * (start - end) * cos_out + + def get_regular_lr(self): + progress = self.cnt_epoch + max_progress = self.max_epoch + target_lr = self.min_lr + return self.annealing_cos(self.cosine_base_lr, target_lr, progress / + max_progress) # self.cosine_base_lr + + def get_warmup_lr(self, cur_iters): + k = (1 - cur_iters / self.warmup_iters) * (1 - self.warmup_ratio) + warmup_lr = self.regular_lr * (1 - k) # 3e-5 * (1-k) + return warmup_lr + + def step(self, epoch=None): + self.regular_lr = self.get_regular_lr() + self.last_lr = self.get_lr() + self.cnt_epoch = (self.cnt_iters + + 1) // self.num_iters # update step with iters + self.cnt_iters += 1 + + if self.verbose: + print('Epoch {}: {} set learning rate to {}.'.format( + self.last_epoch, self.__class__.__name__, self.last_lr)) + + def get_lr(self): + """Define lr policy""" + cur_iter = self.cnt_iters + if cur_iter >= self.warmup_iters: + return self.regular_lr + else: + warmup_lr = self.get_warmup_lr(cur_iter) + return warmup_lr + + +class CustomWarmupAdjustDecay(LRScheduler): + r""" + We combine warmup and stepwise-cosine which is used in slowfast model. + + Args: + step_base_lr (float): start learning rate used in warmup stage. + warmup_epochs (int): the number epochs of warmup. + lr_decay_rate (float|int, optional): base learning rate decay rate. + step (int): step in change learning rate. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + Returns: + ``CosineAnnealingDecay`` instance to schedule learning rate. 
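+
+        Example (illustrative values only, not taken from a shipped config;
+        ``iter_step: True`` is what makes ``build_lr`` pass ``num_iters``):
+
+            learning_rate:
+                iter_step: True
+                name: 'CustomWarmupAdjustDecay'
+                step_base_lr: 0.01
+                warmup_epochs: 5
+                lr_decay_rate: 0.1
+                boundaries: [10, 20]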
+ """ + + def __init__(self, + step_base_lr, + warmup_epochs, + lr_decay_rate, + boundaries, + num_iters=None, + last_epoch=-1, + verbose=False): + self.step_base_lr = step_base_lr + self.warmup_epochs = warmup_epochs + self.lr_decay_rate = lr_decay_rate + self.boundaries = boundaries + self.num_iters = num_iters + #call step() in base class, last_lr/last_epoch/base_lr will be update + super(CustomWarmupAdjustDecay, self).__init__(last_epoch=last_epoch, + verbose=verbose) + + def step(self, epoch=None): + """ + ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` . + The new learning rate will take effect on next ``optimizer.step`` . + Args: + epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1. + Returns: + None + """ + if epoch is None: + if self.last_epoch == -1: + self.last_epoch += 1 + else: + self.last_epoch += 1 / self.num_iters # update step with iters + else: + self.last_epoch = epoch + + self.last_lr = self.get_lr() + + if self.verbose: + print('Epoch {}: {} set learning rate to {}.'.format( + self.last_epoch, self.__class__.__name__, self.last_lr)) + + def get_lr(self): + if self.last_epoch < self.warmup_epochs: + lr = self.step_base_lr * (self.last_epoch + 1) / self.warmup_epochs + else: + lr = self.step_base_lr * (self.lr_decay_rate**np.sum( + self.last_epoch >= np.array(self.boundaries))) + return lr diff --git a/Bank_second_part/detect_process/paddlevideo/solver/lr.py b/Bank_second_part/detect_process/paddlevideo/solver/lr.py new file mode 100644 index 0000000..3a56fad --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/solver/lr.py @@ -0,0 +1,52 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict + +from paddle.optimizer.lr import LRScheduler + +from . import custom_lr + + +def build_lr(cfg: Dict, num_iters: int) -> LRScheduler: + """Build a learning rate scheduler accroding to ```OPTIMIZER``` configuration, and it always pass into the optimizer. + In configuration: + learning_rate: + name: 'PiecewiseDecay' + boundaries: [20, 60] + values: [0.00025, 0.000025, 0.0000025] + + Args: + cfg (Dict): learning rate configuration. + num_iters (int): The number of iterations that may be used when calculating the learning rate + + Returns: + LRScheduler: learning rate scheduler. 
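+
+        Typical call site (as used by the training entry points in this repo):
+
+            lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader))
+
+        When the config carries ``iter_step: True``, that flag is popped and
+        replaced by ``num_iters`` so per-iteration schedulers know how many
+        steps make up one epoch.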
+ """ + + cfg_copy = cfg.copy() + + #when learning_rate is LRScheduler + if cfg_copy.get('learning_rate') and isinstance(cfg_copy['learning_rate'], + dict): + cfg_copy['learning_rate'] = build_lr( + cfg_copy['learning_rate'], + num_iters) #not support only inner iter_step + + lr_name = cfg_copy.pop('name') + if cfg_copy.get('iter_step'): + cfg_copy['num_iters'] = num_iters + cfg_copy.pop('iter_step') + + return getattr(custom_lr, lr_name)(**cfg_copy) diff --git a/Bank_second_part/detect_process/paddlevideo/solver/optimizer.py b/Bank_second_part/detect_process/paddlevideo/solver/optimizer.py new file mode 100644 index 0000000..46ff916 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/solver/optimizer.py @@ -0,0 +1,132 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Dict + +import paddle +from paddle.optimizer.lr import LRScheduler +from paddle.regularizer import L1Decay, L2Decay +from paddlevideo.utils import get_logger + + +def build_optimizer(cfg: Dict, + lr_scheduler: LRScheduler, + model: paddle.nn.Layer, + use_amp: bool = False, + amp_level: str = None) -> paddle.optimizer.Optimizer: + """Build an optimizer and learning rate scheduler to optimize parameters accroding to ```OPTIMIZER``` field in configuration. + + In configuration: + OPTIMIZER: + name: Momentum + momentum: 0.9 + weight_decay: 0.001 + or + + OPTIMIZER: + name: Momentum + momentum: 0.9 + weight_decay: + name: "L1" + value: 0.001 + + Momentum optimizer will be applied to optimize network and L1Decay regularizer will be applied to avoid overfit. + + OPTIMIZER: + name: Adam + weight_decay: + name: "L2" + value: 0.001 + + Adam optimizer will be applied to optimize network and L2Decay regularizer will applied to avoid overfit. + + Refer to ```https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/regularizer/L2Decay_en.html``` for more details. + + Args: + cfg (Dict): optimizer configuration. + lr_scheduler (LRScheduler): learning rate scheduler. + model (paddle.nn.Layer, optional): model which contains parameters to be optimized. Defaults to None. + use_amp (bool, optional): Whether use amp. Defaults to False. + amp_level (str, optional): amp level when amp is enabled. Defaults to None. + + + Returns: + paddle.optimizer.Optimizer: an optimizer for the input model. + """ + logger = get_logger("paddlevideo") + cfg_copy = cfg.copy() + # NOTE: check none and illegal cfg!!! 
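+    # cfg_copy is consumed destructively below: 'name' selects the class from
+    # paddle.optimizer, 'weight_decay' / 'grad_clip' accept either a bare float
+    # or a {'name': ..., 'value': ...} dict, 'no_weight_decay_name' excludes the
+    # listed parameters from decay (useful for AdamW), and 'learning_rate' is
+    # dropped in favour of the lr_scheduler built by build_lr().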
+ opt_name = cfg_copy.pop('name') + # deal with weight decay + if cfg_copy.get('weight_decay'): + if isinstance(cfg_copy.get('weight_decay'), + float): # just an float factor + cfg_copy['weight_decay'] = cfg_copy.get('weight_decay') + elif 'L1' in cfg_copy.get('weight_decay').get( + 'name').upper(): # specify L2 wd and it's float factor + cfg_copy['weight_decay'] = L1Decay( + cfg_copy.get('weight_decay').get('value')) + elif 'L2' in cfg_copy.get('weight_decay').get( + 'name').upper(): # specify L1 wd and it's float factor + cfg_copy['weight_decay'] = L2Decay( + cfg_copy.get('weight_decay').get('value')) + else: + raise ValueError + + # deal with grad clip + if cfg_copy.get('grad_clip'): + if isinstance(cfg_copy.get('grad_clip'), float): + cfg_copy['grad_clip'] = cfg_copy.get('grad_clip').get('value') + elif 'global' in cfg_copy.get('grad_clip').get('name').lower(): + cfg_copy['grad_clip'] = paddle.nn.ClipGradByGlobalNorm( + cfg_copy.get('grad_clip').get('value')) + else: + raise ValueError + + # Set for optimizers that cannot be applied to l2decay, i.e. AdamW + if cfg_copy.get('no_weight_decay_name'): + no_weight_decay_name = cfg_copy.pop('no_weight_decay_name') + no_weight_decay_name_list = no_weight_decay_name.split(' ') + + # NOTE: use param.name not name + no_weight_decay_param_list = [ + param.name for name, param in model.named_parameters() + if any(key_word in name for key_word in no_weight_decay_name_list) + ] # get the full param name of no weight decay + + _apply_decay_param_fun = lambda name: name not in no_weight_decay_param_list + cfg_copy['apply_decay_param_fun'] = _apply_decay_param_fun + logger.info( + f"No weight Decay list :({len(no_weight_decay_param_list)})", + no_weight_decay_param_list) + + cfg_copy.pop('learning_rate') + + # set multi_precision + optimizer_setting = { + 'learning_rate': lr_scheduler, + 'parameters': model.parameters(), + **cfg_copy + } + optimizer_init_args = inspect.getargspec( + getattr(paddle.optimizer, opt_name).__init__).args + if use_amp and amp_level == "O2" and "multi_precision" in optimizer_init_args: + # support "multi_precision" arg in optimizer's __init__ function. + optimizer_setting.update({"multi_precision": True}) + logger.info( + "Set multi_precision=True for optimizer when use_amp=True and amp_level='O2'" + ) + + return getattr(paddle.optimizer, opt_name)(**optimizer_setting) diff --git a/Bank_second_part/detect_process/paddlevideo/tasks/__init__.py b/Bank_second_part/detect_process/paddlevideo/tasks/__init__.py new file mode 100644 index 0000000..4d43f09 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/tasks/__init__.py @@ -0,0 +1,20 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
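+
+# Task-level entry points: each function re-exported below is a self-contained
+# training or testing loop, typically driven by a launcher script that passes in
+# a parsed config (see paddlevideo.utils.get_config).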
+ +from .train import train_model +from .test import test_model +from .train_dali import train_dali +from .train_multigrid import train_model_multigrid + +__all__ = ['train_model', 'test_model', 'train_dali', 'train_model_multigrid'] diff --git a/Bank_second_part/detect_process/paddlevideo/tasks/test.py b/Bank_second_part/detect_process/paddlevideo/tasks/test.py new file mode 100644 index 0000000..31c8653 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/tasks/test.py @@ -0,0 +1,90 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddlevideo.utils import get_logger, load + +from ..loader.builder import build_dataloader, build_dataset +from ..metrics import build_metric +from ..modeling.builder import build_model + +logger = get_logger("paddlevideo") + + +@paddle.no_grad() +def test_model(cfg, weights, parallel=True): + """Test model entry + + Args: + cfg (dict): configuration. + weights (str): weights path to load. + parallel (bool): Whether to do multi-cards testing. Default: True. + + """ + + if cfg.get('use_npu', False): + places = paddle.set_device('npu') + elif cfg.get('use_xpu', False): + places = paddle.set_device('xpu') + else: + places = paddle.set_device('gpu') + + # 1. Construct model. + if cfg.MODEL.get('backbone') and cfg.MODEL.backbone.get('pretrained'): + cfg.MODEL.backbone.pretrained = '' # disable pretrain model init + model = build_model(cfg.MODEL) + + if parallel: + model = paddle.DataParallel(model) + + # 2. Construct dataset and dataloader. 
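+    #    test_mode=True switches the dataset/pipeline into evaluation behaviour
+    #    (deterministic sampling); CFBI is special-cased below because the VOS
+    #    task iterates the dataset per video instead of going through a dataloader.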
+ cfg.DATASET.test.test_mode = True + dataset = build_dataset((cfg.DATASET.test, cfg.PIPELINE.test)) + batch_size = cfg.DATASET.get("test_batch_size", 8) + + # default num worker: 0, which means no subprocess will be created + num_workers = cfg.DATASET.get('num_workers', 0) + num_workers = cfg.DATASET.get('test_num_workers', num_workers) + dataloader_setting = dict(batch_size=batch_size, + num_workers=num_workers, + places=places, + drop_last=False, + shuffle=False) + + data_loader = build_dataloader( + dataset, **dataloader_setting) if cfg.model_name not in ['CFBI' + ] else dataset + + model.eval() + + state_dicts = load(weights) + model.set_state_dict(state_dicts) + + # add params to metrics + cfg.METRIC.data_size = len(dataset) + cfg.METRIC.batch_size = batch_size + Metric = build_metric(cfg.METRIC) + + if cfg.MODEL.framework == "FastRCNN": + Metric.set_dataset_info(dataset.info, len(dataset)) + + for batch_id, data in enumerate(data_loader): + if cfg.model_name in [ + 'CFBI' + ]: # for VOS task, dataset for video and dataloader for frames in each video + Metric.update(batch_id, data, model) + else: + outputs = model(data, mode='test') + Metric.update(batch_id, data, outputs) + Metric.accumulate() diff --git a/Bank_second_part/detect_process/paddlevideo/tasks/train.py b/Bank_second_part/detect_process/paddlevideo/tasks/train.py new file mode 100644 index 0000000..451ec5d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/tasks/train.py @@ -0,0 +1,426 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import time + +import paddle +import paddle.amp as amp +import paddle.distributed as dist +import paddle.distributed.fleet as fleet +from paddlevideo.utils import (add_profiler_step, build_record, get_logger, + load, log_batch, log_epoch, mkdir, save) + +from ..loader.builder import build_dataloader, build_dataset +from ..metrics.ava_utils import collect_results_cpu +from ..modeling.builder import build_model +from ..solver import build_lr, build_optimizer +from ..utils import do_preciseBN + + +def train_model(cfg, + weights=None, + parallel=True, + validate=True, + use_amp=False, + amp_level=None, + max_iters=None, + use_fleet=False, + profiler_options=None): + """Train model entry + + Args: + cfg (dict): configuration. + weights (str, optional): weights path for finetuning. Defaults to None. + parallel (bool, optional): whether multi-cards training. Defaults to True. + validate (bool, optional): whether to do evaluation. Defaults to True. + use_amp (bool, optional): whether to use automatic mixed precision during training. Defaults to False. + amp_level (str, optional): amp optmization level, must be 'O1' or 'O2' when use_amp is True. Defaults to None. + max_iters (int, optional): max running iters in an epoch. Defaults to None. + use_fleet (bool, optional): whether to use fleet. Defaults to False. + profiler_options (str, optional): configuration for the profiler function. Defaults to None. 
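+
+        Minimal usage (sketch; config loading and any distributed launch wiring
+        are assumed to be handled by the caller):
+
+            cfg = get_config('path/to/config.yaml')  # hypothetical path
+            train_model(cfg, validate=True)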
+ + """ + if use_fleet: + fleet.init(is_collective=True) + + logger = get_logger("paddlevideo") + batch_size = cfg.DATASET.get('batch_size', 8) + valid_batch_size = cfg.DATASET.get('valid_batch_size', batch_size) + + # gradient accumulation settings + use_gradient_accumulation = cfg.get('GRADIENT_ACCUMULATION', None) + if use_gradient_accumulation and dist.get_world_size() >= 1: + global_batch_size = cfg.GRADIENT_ACCUMULATION.get( + 'global_batch_size', None) + num_gpus = dist.get_world_size() + + assert isinstance( + global_batch_size, int + ), f"global_batch_size must be int, but got {type(global_batch_size)}" + assert batch_size <= global_batch_size, \ + f"global_batch_size({global_batch_size}) must not be less than batch_size({batch_size})" + + cur_global_batch_size = batch_size * num_gpus # The number of batches calculated by all GPUs at one time + assert global_batch_size % cur_global_batch_size == 0, \ + f"The global batchsize({global_batch_size}) must be divisible by cur_global_batch_size({cur_global_batch_size})" + cfg.GRADIENT_ACCUMULATION[ + "num_iters"] = global_batch_size // cur_global_batch_size + # The number of iterations required to reach the global batchsize + logger.info( + f"Using gradient accumulation training strategy, " + f"global_batch_size={global_batch_size}, " + f"num_gpus={num_gpus}, " + f"num_accumulative_iters={cfg.GRADIENT_ACCUMULATION.num_iters}") + + if cfg.get('use_npu', False): + places = paddle.set_device('npu') + elif cfg.get('use_xpu', False): + places = paddle.set_device('xpu') + else: + places = paddle.set_device('gpu') + + # default num worker: 0, which means no subprocess will be created + num_workers = cfg.DATASET.get('num_workers', 0) + valid_num_workers = cfg.DATASET.get('valid_num_workers', num_workers) + model_name = cfg.model_name + output_dir = cfg.get("output_dir", f"./output/{model_name}") + mkdir(output_dir) + + # 1. Construct model + model = build_model(cfg.MODEL) + + if cfg.get('to_static', False): + specs = None + model = paddle.jit.to_static(model, input_spec=specs) + logger.info( + "Successfully to apply @to_static with specs: {}".format(specs)) + + # 2. Construct dataset and dataloader for training and evaluation + train_dataset = build_dataset((cfg.DATASET.train, cfg.PIPELINE.train)) + train_dataloader_setting = dict( + batch_size=batch_size, + num_workers=num_workers, + collate_fn_cfg=cfg.get('MIX', None), + places=places) + train_loader = build_dataloader(train_dataset, **train_dataloader_setting) + + if validate: + valid_dataset = build_dataset((cfg.DATASET.valid, cfg.PIPELINE.valid)) + validate_dataloader_setting = dict( + batch_size=valid_batch_size, + num_workers=valid_num_workers, + places=places, + drop_last=False, + shuffle=cfg.DATASET.get( + 'shuffle_valid', + False) # NOTE: attention_LSTM needs to shuffle valid data. + ) + valid_loader = build_dataloader(valid_dataset, + **validate_dataloader_setting) + + # 3. Construct learning rate scheduler(lr) and optimizer + lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader)) + optimizer = build_optimizer( + cfg.OPTIMIZER, lr, model=model, use_amp=use_amp, amp_level=amp_level) + + # 4. 
Construct scalar and convert parameters for amp(optional) + if use_amp: + scaler = amp.GradScaler( + init_loss_scaling=2.0**16, + incr_every_n_steps=2000, + decr_every_n_nan_or_inf=1) + # convert model parameters to fp16 when amp_level is O2(pure fp16) + model, optimizer = amp.decorate( + models=model, + optimizers=optimizer, + level=amp_level, + master_weight=True, + save_dtype=None) + # NOTE: save_dtype is set to float32 now. + logger.info(f"Training in amp mode, amp_level={amp_level}.") + else: + assert amp_level is None, f"amp_level must be None when training in fp32 mode, but got {amp_level}." + logger.info("Training in fp32 mode.") + + # 5. Resume(optional) + resume_epoch = cfg.get("resume_epoch", 0) + if resume_epoch: + filename = osp.join(output_dir, + model_name + f"_epoch_{resume_epoch:05d}") + resume_model_dict = load(filename + '.pdparams') + resume_opt_dict = load(filename + '.pdopt') + model.set_state_dict(resume_model_dict) + optimizer.set_state_dict(resume_opt_dict) + logger.info("Resume from checkpoint: {}".format(filename)) + + # 6. Finetune(optional) + if weights: + assert resume_epoch == 0, f"Conflict occurs when finetuning, please switch resume function off by setting resume_epoch to 0 or not indicating it." + model_dict = load(weights) + model.set_state_dict(model_dict) + logger.info("Finetune from checkpoint: {}".format(weights)) + + # 7. Parallelize(optional) + if parallel: + model = paddle.DataParallel(model) + + if use_fleet: + model = fleet.distributed_model(model) + optimizer = fleet.distributed_optimizer(optimizer) + + # 8. Train Model + best = 0.0 + for epoch in range(0, cfg.epochs): + if epoch < resume_epoch: + logger.info( + f"| epoch: [{epoch + 1}] <= resume_epoch: [{resume_epoch}], continue..." + ) + continue + model.train() + + record_list = build_record(cfg.MODEL) + tic = time.time() + for i, data in enumerate(train_loader): + """Next two line of code only used in test_tipc, + ignore it most of the time""" + if max_iters is not None and i >= max_iters: + break + + record_list['reader_time'].update(time.time() - tic) + + # Collect performance information when profiler_options is activate + add_profiler_step(profiler_options) + + # 8.1 forward + # AMP # + if use_amp: + with amp.auto_cast( + custom_black_list={"reduce_mean", "conv3d"}, + level=amp_level): + outputs = model(data, mode='train') + avg_loss = outputs['loss'] + if use_gradient_accumulation: + # clear grad at when epoch begins + if i == 0: + optimizer.clear_grad() + # Loss normalization + avg_loss /= cfg.GRADIENT_ACCUMULATION.num_iters + # Loss scaling + scaled = scaler.scale(avg_loss) + # 8.2 backward + scaled.backward() + # 8.3 minimize + if (i + 1) % cfg.GRADIENT_ACCUMULATION.num_iters == 0: + scaler.minimize(optimizer, scaled) + optimizer.clear_grad() + else: # general case + # Loss scaling + scaled = scaler.scale(avg_loss) + # 8.2 backward + scaled.backward() + # 8.3 minimize + scaler.minimize(optimizer, scaled) + optimizer.clear_grad() + else: + outputs = model(data, mode='train') + avg_loss = outputs['loss'] + if use_gradient_accumulation: + # clear grad at when epoch begins + if i == 0: + optimizer.clear_grad() + # Loss normalization + avg_loss /= cfg.GRADIENT_ACCUMULATION.num_iters + # 8.2 backward + avg_loss.backward() + # 8.3 minimize + if (i + 1) % cfg.GRADIENT_ACCUMULATION.num_iters == 0: + optimizer.step() + optimizer.clear_grad() + else: # general case + # 8.2 backward + avg_loss.backward() + # 8.3 minimize + optimizer.step() + optimizer.clear_grad() + + # log record + 
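+            # optimizer.get_lr() is the lr actually used for this step; the
+            # iter-level lr.step() further down advances it for the next batch.
+            # 'train'-mode outputs form a dict (loss plus metrics) and only keys
+            # already present in record_list are accumulated.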
record_list['lr'].update(optimizer.get_lr(), batch_size) + for name, value in outputs.items(): + if name in record_list: + record_list[name].update(value, batch_size) + + record_list['batch_time'].update(time.time() - tic) + tic = time.time() + + if i % cfg.get("log_interval", 10) == 0: + ips = "ips: {:.5f} instance/sec,".format( + batch_size / record_list["batch_time"].val) + cur_progress = ((i + 1) + epoch * len(train_loader)) / ( + len(train_loader) * cfg.epochs) + eta = int(record_list["batch_time"].sum * + (1 - cur_progress) / cur_progress + 0.5) + log_batch(record_list, i, epoch + 1, cfg.epochs, "train", ips, + eta) + + # learning rate iter step + if cfg.OPTIMIZER.learning_rate.get("iter_step"): + lr.step() + + # learning rate epoch step + if not cfg.OPTIMIZER.learning_rate.get("iter_step"): + lr.step() + + ips = "avg_ips: {:.5f} instance/sec.".format( + batch_size * record_list["batch_time"].count / + record_list["batch_time"].sum) + log_epoch(record_list, epoch + 1, "train", ips) + + def evaluate(best): + model.eval() + results = [] + record_list = build_record(cfg.MODEL) + record_list.pop('lr') + tic = time.time() + if parallel: + rank = dist.get_rank() + # single_gpu_test and multi_gpu_test + for i, data in enumerate(valid_loader): + """Next two line of code only used in test_tipc, + ignore it most of the time""" + if max_iters is not None and i >= max_iters: + break + + if use_amp: + with amp.auto_cast( + custom_black_list={"reduce_mean", "conv3d"}, + level=amp_level): + outputs = model(data, mode='valid') + else: + outputs = model(data, mode='valid') + + if cfg.MODEL.framework == "FastRCNN": + results.extend(outputs) + + # log_record + if cfg.MODEL.framework != "FastRCNN": + for name, value in outputs.items(): + if name in record_list: + record_list[name].update(value, batch_size) + + record_list['batch_time'].update(time.time() - tic) + tic = time.time() + + if i % cfg.get("log_interval", 10) == 0: + ips = "ips: {:.5f} instance/sec.".format( + valid_batch_size / record_list["batch_time"].val) + log_batch(record_list, i, epoch + 1, cfg.epochs, "val", ips) + + if cfg.MODEL.framework == "FastRCNN": + if parallel: + results = collect_results_cpu(results, len(valid_dataset)) + if not parallel or (parallel and rank == 0): + eval_res = valid_dataset.evaluate(results) + for name, value in eval_res.items(): + record_list[name].update(value, valid_batch_size) + + ips = "avg_ips: {:.5f} instance/sec.".format( + valid_batch_size * record_list["batch_time"].count / + record_list["batch_time"].sum) + log_epoch(record_list, epoch + 1, "val", ips) + + best_flag = False + if cfg.MODEL.framework == "FastRCNN" and (not parallel or + (parallel and rank == 0)): + if record_list["mAP@0.5IOU"].val > best: + best = record_list["mAP@0.5IOU"].val + best_flag = True + return best, best_flag + + if cfg.MODEL.framework == "YOWOLocalizer" and (not parallel or + (parallel and rank == 0)): + if record_list["fscore"].avg > best: + best = record_list["fscore"].avg + best_flag = True + return best, best_flag + + # forbest2, cfg.MODEL.framework != "FastRCNN": + for top_flag in ['hit_at_one', 'top1', 'rmse', "F1@0.50"]: + if record_list.get(top_flag): + if top_flag != 'rmse' and record_list[top_flag].avg > best: + best = record_list[top_flag].avg + best_flag = True + elif top_flag == 'rmse' and ( + best == 0.0 or record_list[top_flag].avg < best): + best = record_list[top_flag].avg + best_flag = True + + return best, best_flag + + # use precise bn to improve acc + if cfg.get("PRECISEBN") and ( + epoch % 
cfg.PRECISEBN.preciseBN_interval == 0 + or epoch == cfg.epochs - 1): + do_preciseBN(model, train_loader, parallel, + min(cfg.PRECISEBN.num_iters_preciseBN, + len(train_loader)), use_amp, amp_level) + + # 9. Validation + if validate and (epoch % cfg.get("val_interval", 1) == 0 + or epoch == cfg.epochs - 1): + with paddle.no_grad(): + best, save_best_flag = evaluate(best) + # save best + if save_best_flag: + save(optimizer.state_dict(), + osp.join(output_dir, model_name + "_best.pdopt")) + save_student_model_flag = True if "Distillation" in cfg.MODEL.framework else False + save( + model.state_dict(), + osp.join(output_dir, model_name + "_best.pdparams"), + save_student_model=save_student_model_flag) + if model_name == "AttentionLstm": + logger.info( + f"Already save the best model (hit_at_one){best}") + elif cfg.MODEL.framework == "FastRCNN": + logger.info( + f"Already save the best model (mAP@0.5IOU){int(best * 10000) / 10000}" + ) + elif cfg.MODEL.framework == "DepthEstimator": + logger.info( + f"Already save the best model (rmse){int(best * 10000) / 10000}" + ) + elif cfg.MODEL.framework in ['MSTCN', 'ASRF']: + logger.info( + f"Already save the best model (F1@0.50){int(best * 10000) / 10000}" + ) + elif cfg.MODEL.framework in ['YOWOLocalizer']: + logger.info( + f"Already save the best model (fsocre){int(best * 10000) / 10000}" + ) + else: + logger.info( + f"Already save the best model (top1 acc){int(best * 10000) / 10000}" + ) + + # 10. Save model and optimizer + if epoch % cfg.get("save_interval", 1) == 0 or epoch == cfg.epochs - 1: + save(optimizer.state_dict(), + osp.join(output_dir, + model_name + f"_epoch_{epoch + 1:05d}.pdopt")) + save(model.state_dict(), + osp.join(output_dir, + model_name + f"_epoch_{epoch + 1:05d}.pdparams")) + + logger.info(f'training {model_name} finished') diff --git a/Bank_second_part/detect_process/paddlevideo/tasks/train_dali.py b/Bank_second_part/detect_process/paddlevideo/tasks/train_dali.py new file mode 100644 index 0000000..8dd0a20 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/tasks/train_dali.py @@ -0,0 +1,143 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import os.path as osp + +import paddle +from ..modeling.builder import build_model +from ..solver import build_lr, build_optimizer +from ..utils import do_preciseBN +from paddlevideo.utils import get_logger, coloring +from paddlevideo.utils import (AverageMeter, build_record, log_batch, log_epoch, + save, load, mkdir) +from paddlevideo.loader import TSN_Dali_loader, get_input_data +""" +We only supported DALI training for TSN model now. +""" + + +def train_dali(cfg, weights=None, parallel=True): + """Train model entry + + Args: + cfg (dict): configuration. + weights (str): weights path for finetuning. + parallel (bool): Whether multi-cards training. Default: True. 
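+
+        Note: as the module docstring above says, only TSN is currently wired up
+        for DALI, and ``cfg`` is expected to provide a ``DALI_LOADER`` section
+        (batch size, reader settings, etc.) rather than the usual DATASET/PIPELINE ones.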
+ + """ + + logger = get_logger("paddlevideo") + batch_size = cfg.DALI_LOADER.get('batch_size', 8) + places = paddle.set_device('gpu') + model_name = cfg.model_name + output_dir = cfg.get("output_dir", f"./output/{model_name}") + mkdir(output_dir) + + # 1. Construct model + model = build_model(cfg.MODEL) + if parallel: + model = paddle.DataParallel(model) + + # 2. Construct dali dataloader + train_loader = TSN_Dali_loader(cfg.DALI_LOADER).build_dali_reader() + + # 3. Construct solver. + lr = build_lr(cfg.OPTIMIZER.learning_rate, None) + optimizer = build_optimizer(cfg.OPTIMIZER, lr, model=model) + + # Resume + resume_epoch = cfg.get("resume_epoch", 0) + if resume_epoch: + filename = osp.join(output_dir, + model_name + f"_epoch_{resume_epoch:05d}") + resume_model_dict = load(filename + '.pdparams') + resume_opt_dict = load(filename + '.pdopt') + model.set_state_dict(resume_model_dict) + optimizer.set_state_dict(resume_opt_dict) + + # Finetune: + if weights: + assert resume_epoch == 0, f"Conflict occurs when finetuning, please switch resume function off by setting resume_epoch to 0 or not indicating it." + model_dict = load(weights) + model.set_state_dict(model_dict) + + # 4. Train Model + for epoch in range(0, cfg.epochs): + if epoch < resume_epoch: + logger.info( + f"| epoch: [{epoch+1}] <= resume_epoch: [{ resume_epoch}], continue... " + ) + continue + model.train() + record_list = build_record(cfg.MODEL) + tic = time.time() + for i, data in enumerate(train_loader): + data = get_input_data(data) + record_list['reader_time'].update(time.time() - tic) + # 4.1 forward + outputs = model(data, mode='train') + # 4.2 backward + avg_loss = outputs['loss'] + avg_loss.backward() + # 4.3 minimize + optimizer.step() + optimizer.clear_grad() + + # log record + record_list['lr'].update(optimizer._global_learning_rate(), + batch_size) + for name, value in outputs.items(): + record_list[name].update(value, batch_size) + + record_list['batch_time'].update(time.time() - tic) + tic = time.time() + + if i % cfg.get("log_interval", 10) == 0: + ips = "ips: {:.5f} instance/sec.".format( + batch_size / record_list["batch_time"].val) + log_batch(record_list, i, epoch + 1, cfg.epochs, "train", ips) + + # learning rate iter step + if cfg.OPTIMIZER.learning_rate.get("iter_step"): + lr.step() + + # learning rate epoch step + if not cfg.OPTIMIZER.learning_rate.get("iter_step"): + lr.step() + + ips = "ips: {:.5f} instance/sec.".format( + batch_size * record_list["batch_time"].count / + record_list["batch_time"].sum) + log_epoch(record_list, epoch + 1, "train", ips) + + # use precise bn to improve acc + if cfg.get("PRECISEBN") and (epoch % cfg.PRECISEBN.preciseBN_interval + == 0 or epoch == cfg.epochs - 1): + do_preciseBN( + model, train_loader, parallel, + min(cfg.PRECISEBN.num_iters_preciseBN, len(train_loader))) + + # 5. Save model and optimizer + if epoch % cfg.get("save_interval", 1) == 0 or epoch == cfg.epochs - 1: + save( + optimizer.state_dict(), + osp.join(output_dir, + model_name + f"_epoch_{epoch+1:05d}.pdopt")) + save( + model.state_dict(), + osp.join(output_dir, + model_name + f"_epoch_{epoch+1:05d}.pdparams")) + + logger.info(f'training {model_name} finished') diff --git a/Bank_second_part/detect_process/paddlevideo/tasks/train_multigrid.py b/Bank_second_part/detect_process/paddlevideo/tasks/train_multigrid.py new file mode 100644 index 0000000..19e756f --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/tasks/train_multigrid.py @@ -0,0 +1,335 @@ +# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import os.path as osp + +import paddle +import paddle.distributed as dist + +from ..loader.builder import build_dataloader, build_dataset +from ..modeling.builder import build_model +from ..solver import build_lr, build_optimizer +from ..utils import do_preciseBN +from paddlevideo.utils import get_logger, coloring +from paddlevideo.utils import (AverageMeter, build_record, log_batch, log_epoch, + save, load, mkdir) +from paddlevideo.utils.multigrid import MultigridSchedule, aggregate_sub_bn_stats, subn_load, subn_save, is_eval_epoch + + +def construct_loader(cfg, places, validate, precise_bn, num_iters_precise_bn, + world_size): + batch_size = cfg.DATASET.get('batch_size', 2) + train_dataset = build_dataset((cfg.DATASET.train, cfg.PIPELINE.train)) + precise_bn_dataloader_setting = dict( + batch_size=batch_size, + num_workers=cfg.DATASET.get('num_workers', 0), + places=places, + ) + if precise_bn: + cfg.DATASET.train.num_samples_precise_bn = num_iters_precise_bn * batch_size * world_size + precise_bn_dataset = build_dataset((cfg.DATASET.train, + cfg.PIPELINE.train)) + precise_bn_loader = build_dataloader(precise_bn_dataset, + **precise_bn_dataloader_setting) + cfg.DATASET.train.num_samples_precise_bn = None + else: + precise_bn_loader = None + + if cfg.MULTIGRID.SHORT_CYCLE: + # get batch size list in short cycle schedule + bs_factor = [ + int( + round((float(cfg.PIPELINE.train.transform[1]['MultiCrop'][ + 'target_size']) / (s * cfg.MULTIGRID.default_crop_size)) + **2)) for s in cfg.MULTIGRID.short_cycle_factors + ] + batch_sizes = [ + batch_size * bs_factor[0], + batch_size * bs_factor[1], + batch_size, + ] + train_dataloader_setting = dict( + batch_size=batch_sizes, + multigrid=True, + num_workers=cfg.DATASET.get('num_workers', 0), + places=places, + ) + else: + train_dataloader_setting = precise_bn_dataloader_setting + + train_loader = build_dataloader(train_dataset, **train_dataloader_setting) + if validate: + valid_dataset = build_dataset((cfg.DATASET.valid, cfg.PIPELINE.valid)) + validate_dataloader_setting = dict( + batch_size=batch_size, + num_workers=cfg.DATASET.get('num_workers', 0), + places=places, + drop_last=False, + shuffle=False) + valid_loader = build_dataloader(valid_dataset, + **validate_dataloader_setting) + else: + valid_loader = None + + return train_loader, valid_loader, precise_bn_loader + + +def build_trainer(cfg, places, parallel, validate, precise_bn, + num_iters_precise_bn, world_size): + """ + Build training model and its associated tools, including optimizer, + dataloaders and meters. + Args: + cfg (CfgNode): configs. + Returns: + model: training model. + optimizer: optimizer. + train_loader: training data loader. + val_loader: validatoin data loader. + precise_bn_loader: training data loader for computing + precise BN. 
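+        Note:
+            Called from train_model_multigrid whenever the long-cycle schedule
+            changes the input scale / batch size, so model, optimizer and all
+            loaders are rebuilt to match the new shapes.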
+ """ + model = build_model(cfg.MODEL) + if parallel: + model = paddle.DataParallel(model) + + train_loader, valid_loader, precise_bn_loader = \ + construct_loader(cfg, + places, + validate, + precise_bn, + num_iters_precise_bn, + world_size, + ) + + lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader)) + optimizer = build_optimizer(cfg.OPTIMIZER, lr, model=model) + + return ( + model, + lr, + optimizer, + train_loader, + valid_loader, + precise_bn_loader, + ) + + +def train_model_multigrid(cfg, world_size=1, validate=True): + """Train model entry + + Args: + cfg (dict): configuration. + parallel (bool): Whether multi-card training. Default: True + validate (bool): Whether to do evaluation. Default: False. + + """ + # Init multigrid. + multigrid = None + if cfg.MULTIGRID.LONG_CYCLE or cfg.MULTIGRID.SHORT_CYCLE: + multigrid = MultigridSchedule() + cfg = multigrid.init_multigrid(cfg) + if cfg.MULTIGRID.LONG_CYCLE: + cfg, _ = multigrid.update_long_cycle(cfg, cur_epoch=0) + multi_save_epoch = [i[-1] - 1 for i in multigrid.schedule] + + parallel = world_size != 1 + logger = get_logger("paddlevideo") + batch_size = cfg.DATASET.get('batch_size', 2) + + if cfg.get('use_npu', False): + places = paddle.set_device('npu') + elif cfg.get('use_xpu', False): + places = paddle.set_device('xpu') + else: + places = paddle.set_device('gpu') + + model_name = cfg.model_name + output_dir = cfg.get("output_dir", f"./output/{model_name}") + mkdir(output_dir) + local_rank = dist.ParallelEnv().local_rank + precise_bn = cfg.get("PRECISEBN") + num_iters_precise_bn = cfg.PRECISEBN.num_iters_preciseBN + + # 1. Construct model + model = build_model(cfg.MODEL) + if parallel: + model = paddle.DataParallel(model) + + # 2. Construct dataloader + train_loader, valid_loader, precise_bn_loader = \ + construct_loader(cfg, + places, + validate, + precise_bn, + num_iters_precise_bn, + world_size, + ) + + # 3. Construct optimizer + lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader)) + optimizer = build_optimizer( + cfg.OPTIMIZER, lr, parameter_list=model.parameters()) + + # Resume + resume_epoch = cfg.get("resume_epoch", 0) + if resume_epoch: + filename = osp.join( + output_dir, + model_name + str(local_rank) + '_' + f"{resume_epoch:05d}") + subn_load(model, filename, optimizer) + + # 4. Train Model + best = 0. + total_epochs = int(cfg.epochs * cfg.MULTIGRID.epoch_factor) + for epoch in range(total_epochs): + if epoch < resume_epoch: + logger.info( + f"| epoch: [{epoch+1}] <= resume_epoch: [{ resume_epoch}], continue... 
" + ) + continue + + if cfg.MULTIGRID.LONG_CYCLE: + cfg, changed = multigrid.update_long_cycle(cfg, epoch) + if changed: + logger.info("====== Rebuild model/optimizer/loader =====") + ( + model, + lr, + optimizer, + train_loader, + valid_loader, + precise_bn_loader, + ) = build_trainer(cfg, places, parallel, validate, precise_bn, + num_iters_precise_bn, world_size) + + #load checkpoint after re-build model + if epoch != 0: + #epoch no need to -1, haved add 1 when save + filename = osp.join( + output_dir, + model_name + str(local_rank) + '_' + f"{(epoch):05d}") + subn_load(model, filename, optimizer) + #update lr last epoch, not to use saved params + lr.last_epoch = epoch + lr.step(rebuild=True) + + model.train() + record_list = build_record(cfg.MODEL) + tic = time.time() + for i, data in enumerate(train_loader): + record_list['reader_time'].update(time.time() - tic) + # 4.1 forward + outputs = model(data, mode='train') + # 4.2 backward + avg_loss = outputs['loss'] + avg_loss.backward() + # 4.3 minimize + optimizer.step() + optimizer.clear_grad() + + # log record + record_list['lr'].update( + float(optimizer._global_learning_rate()), batch_size) + for name, value in outputs.items(): + record_list[name].update(float(value), batch_size) + record_list['batch_time'].update(time.time() - tic) + tic = time.time() + + if i % cfg.get("log_interval", 10) == 0: + ips = "ips: {:.5f} instance/sec.".format( + batch_size / record_list["batch_time"].val) + log_batch(record_list, i, epoch + 1, total_epochs, "train", ips) + + # learning rate iter step + if cfg.OPTIMIZER.learning_rate.get("iter_step"): + lr.step() + + # learning rate epoch step + if not cfg.OPTIMIZER.learning_rate.get("iter_step"): + lr.step() + + ips = "ips: {:.5f} instance/sec.".format( + batch_size * record_list["batch_time"].count / + record_list["batch_time"].sum) + log_epoch(record_list, epoch + 1, "train", ips) + + def evaluate(best): + model.eval() + record_list = build_record(cfg.MODEL) + record_list.pop('lr') + tic = time.time() + for i, data in enumerate(valid_loader): + outputs = model(data, mode='valid') + + # log_record + for name, value in outputs.items(): + record_list[name].update(float(value), batch_size) + + record_list['batch_time'].update(time.time() - tic) + tic = time.time() + + if i % cfg.get("log_interval", 10) == 0: + ips = "ips: {:.5f} instance/sec.".format( + batch_size / record_list["batch_time"].val) + log_batch(record_list, i, epoch + 1, total_epochs, "val", + ips) + + ips = "ips: {:.5f} instance/sec.".format( + batch_size * record_list["batch_time"].count / + record_list["batch_time"].sum) + log_epoch(record_list, epoch + 1, "val", ips) + + best_flag = False + if record_list.get('top1') and record_list['top1'].avg > best: + best = record_list['top1'].avg + best_flag = True + return best, best_flag + + # use precise bn to improve acc + if is_eval_epoch(cfg, epoch, total_epochs, multigrid.schedule): + logger.info(f"do precise BN in {epoch+1} ...") + do_preciseBN(model, precise_bn_loader, parallel, + min(num_iters_precise_bn, len(precise_bn_loader))) + + # aggregate sub_BN stats + logger.info("Aggregate sub_BatchNorm stats...") + aggregate_sub_bn_stats(model) + + # 5. 
Validation + if is_eval_epoch(cfg, epoch, total_epochs, multigrid.schedule): + logger.info(f"eval in {epoch+1} ...") + with paddle.no_grad(): + best, save_best_flag = evaluate(best) + # save best + if save_best_flag: + save(optimizer.state_dict(), + osp.join(output_dir, model_name + "_best.pdopt")) + save(model.state_dict(), + osp.join(output_dir, model_name + "_best.pdparams")) + logger.info( + f"Already save the best model (top1 acc){int(best * 10000) / 10000}" + ) + + # 6. Save model and optimizer + if is_eval_epoch( + cfg, epoch, + total_epochs, multigrid.schedule) or epoch % cfg.get( + "save_interval", 10) == 0 or epoch in multi_save_epoch: + logger.info("[Save parameters] ======") + subn_save(output_dir, model_name + str(local_rank) + '_', epoch + 1, + model, optimizer) + + logger.info(f'training {model_name} finished') diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__init__.py b/Bank_second_part/detect_process/paddlevideo/utils/__init__.py new file mode 100644 index 0000000..d18561d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .registry import Registry +from .build_utils import build +from .config import * +from .logger import setup_logger, coloring, get_logger +from .record import AverageMeter, build_record, log_batch, log_epoch +from .dist_utils import get_dist_info, main_only +from .save_load import save, load, load_ckpt, mkdir +from .precise_bn import do_preciseBN +from .profiler import add_profiler_step +__all__ = ['Registry', 'build'] diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..64ae223 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/build_utils.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/build_utils.cpython-310.pyc new file mode 100644 index 0000000..273693d Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/build_utils.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/config.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/config.cpython-310.pyc new file mode 100644 index 0000000..7d3a10e Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/config.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/dist_utils.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/dist_utils.cpython-310.pyc new file mode 100644 index 0000000..c43503f Binary files /dev/null and 
b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/dist_utils.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/logger.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/logger.cpython-310.pyc new file mode 100644 index 0000000..30719bd Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/logger.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/precise_bn.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/precise_bn.cpython-310.pyc new file mode 100644 index 0000000..79ae684 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/precise_bn.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/profiler.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/profiler.cpython-310.pyc new file mode 100644 index 0000000..1287287 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/profiler.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/record.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/record.cpython-310.pyc new file mode 100644 index 0000000..cc8857c Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/record.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/registry.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/registry.cpython-310.pyc new file mode 100644 index 0000000..56450c3 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/registry.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/save_load.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/save_load.cpython-310.pyc new file mode 100644 index 0000000..62b1269 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/save_load.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/build_utils.py b/Bank_second_part/detect_process/paddlevideo/utils/build_utils.py new file mode 100644 index 0000000..73c0ca4 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/build_utils.py @@ -0,0 +1,35 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def build(cfg, registry, key='name'): + """Build a module from config dict. + Args: + cfg (dict): Config dict. It should at least contain the key. + registry (XXX): The registry to search the type from. + key (str): the key. + Returns: + obj: The constructed object. 
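+    Example (illustrative; 'ResNet' and the BACKBONES registry are placeholders
+    for whatever has actually been registered):
+
+        cfg = {'name': 'ResNet', 'depth': 50}
+        backbone = build(cfg, BACKBONES)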
+ """ + + assert isinstance(cfg, dict) and key in cfg + + cfg_copy = cfg.copy() + obj_type = cfg_copy.pop(key) + + obj_cls = registry.get(obj_type) + if obj_cls is None: + raise KeyError('{} is not in the {} registry'.format( + obj_type, registry.name)) + return obj_cls(**cfg_copy) diff --git a/Bank_second_part/detect_process/paddlevideo/utils/config.py b/Bank_second_part/detect_process/paddlevideo/utils/config.py new file mode 100644 index 0000000..f4d7941 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/config.py @@ -0,0 +1,174 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import yaml +from paddlevideo.utils.logger import coloring, get_logger, setup_logger + +__all__ = ['get_config'] + +logger = setup_logger("./", name="paddlevideo", level="INFO") + + +class AttrDict(dict): + def __getattr__(self, key): + return self[key] + + def __setattr__(self, key, value): + if key in self.__dict__: + self.__dict__[key] = value + else: + self[key] = value + + +def create_attr_dict(yaml_config): + from ast import literal_eval + for key, value in yaml_config.items(): + if type(value) is dict: + yaml_config[key] = value = AttrDict(value) + if isinstance(value, str): + try: + value = literal_eval(value) + except BaseException: + pass + if isinstance(value, AttrDict): + create_attr_dict(yaml_config[key]) + else: + yaml_config[key] = value + + +def parse_config(cfg_file): + """Load a config file into AttrDict""" + with open(cfg_file, 'r') as fopen: + yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.SafeLoader)) + create_attr_dict(yaml_config) + return yaml_config + + +def print_dict(d, delimiter=0): + """ + Recursively visualize a dict and + indenting acrrording by the relationship of keys. 
+ """ + placeholder = "-" * 60 + for k, v in sorted(d.items()): + if isinstance(v, dict): + logger.info("{}{} : ".format(delimiter * " ", coloring(k, + "HEADER"))) + print_dict(v, delimiter + 4) + elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict): + logger.info("{}{} : ".format(delimiter * " ", + coloring(str(k), "HEADER"))) + for value in v: + print_dict(value, delimiter + 4) + else: + logger.info("{}{} : {}".format(delimiter * " ", + coloring(k, "HEADER"), + coloring(v, "OKGREEN"))) + + if k.isupper(): + logger.info(placeholder) + + +def print_config(config): + """ + visualize configs + Arguments: + config: configs + """ + print_dict(config) + + +def check_config(config): + """ + Check config + """ + pass + + +def override(dl, ks, v): + """ + Recursively replace dict of list + Args: + dl(dict or list): dict or list to be replaced + ks(list): list of keys + v(str): value to be replaced + """ + def str2num(v): + try: + return eval(v) + except Exception: + return v + + assert isinstance(dl, (list, dict)), ("{} should be a list or a dict") + assert len(ks) > 0, ('lenght of keys should larger than 0') + if isinstance(dl, list): + k = str2num(ks[0]) + if len(ks) == 1: + assert k < len(dl), ('index({}) out of range({})'.format(k, dl)) + dl[k] = str2num(v) + else: + override(dl[k], ks[1:], v) + else: + if len(ks) == 1: + #assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl)) + if not ks[0] in dl: + logger.warning('A new filed ({}) detected!'.format(ks[0], dl)) + dl[ks[0]] = str2num(v) + else: + assert ks[0] in dl, ( + '({}) doesn\'t exist in {}, a new dict field is invalid'.format( + ks[0], dl)) + override(dl[ks[0]], ks[1:], v) + + +def override_config(config, options=None): + """ + Recursively override the config + Args: + config(dict): dict to be replaced + options(list): list of pairs(key0.key1.idx.key2=value) + such as: [ + epochs=20', + 'PIPELINE.train.transform.1.ResizeImage.resize_short=300' + ] + Returns: + config(dict): replaced config + """ + if options is not None: + for opt in options: + assert isinstance(opt, + str), ("option({}) should be a str".format(opt)) + assert "=" in opt, ( + "option({}) should contain a =" + "to distinguish between key and value".format(opt)) + pair = opt.split('=') + assert len(pair) == 2, ("there can be only a = in the option") + key, value = pair + keys = key.split('.') + override(config, keys, value) + + return config + + +def get_config(fname, overrides=None, show=True): + """ + Read config from file + """ + assert os.path.exists(fname), ('config file({}) is not exist'.format(fname)) + config = parse_config(fname) + override_config(config, overrides) + if show: + print_config(config) + check_config(config) + return config diff --git a/Bank_second_part/detect_process/paddlevideo/utils/dist_utils.py b/Bank_second_part/detect_process/paddlevideo/utils/dist_utils.py new file mode 100644 index 0000000..7659e88 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/dist_utils.py @@ -0,0 +1,30 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import functools + +import paddle +import paddle.distributed as dist + +def get_dist_info(): + world_size = dist.get_world_size() + rank = dist.get_rank() + return rank, world_size + +def main_only(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + rank, _ = get_dist_info() + if rank == 0: + return func(*args, **kwargs) + return wrapper diff --git a/Bank_second_part/detect_process/paddlevideo/utils/logger.py b/Bank_second_part/detect_process/paddlevideo/utils/logger.py new file mode 100644 index 0000000..e9791b8 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/logger.py @@ -0,0 +1,113 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import sys +import datetime + +from paddle.distributed import ParallelEnv + + + +Color = { + 'RED': '\033[31m', + 'HEADER': '\033[35m', # deep purple + 'PURPLE': '\033[95m', # purple + 'OKBLUE': '\033[94m', + 'OKGREEN': '\033[92m', + 'WARNING': '\033[93m', + 'FAIL': '\033[91m', + 'ENDC': '\033[0m' +} + + +def coloring(message, color="OKGREEN"): + assert color in Color.keys() + if os.environ.get('COLORING', True): + return Color[color] + str(message) + Color["ENDC"] + else: + return message + + +logger_initialized = [] + + +def setup_logger(output=None, name="paddlevideo", level="INFO"): + """ + Initialize the paddlevideo logger and set its verbosity level to "INFO". + Args: + output (str): a file name or a directory to save log. If None, will not save log file. + If ends with ".txt" or ".log", assumed to be a file name. + Otherwise, logs will be saved to `output/log.txt`. 
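# Illustrative usage sketch (assumed, not part of the original files): the
# main_only decorator from dist_utils.py restricts side effects to rank 0 in
# distributed runs; every other rank skips the body and gets None back.
# save_checkpoint and model are made-up placeholders.
import paddle
from paddlevideo.utils.dist_utils import main_only

@main_only
def save_checkpoint(state_dict, path):
    paddle.save(state_dict, path)

save_checkpoint(model.state_dict(), "output/last.pdparams")   # writes only on rank 0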
+ name (str): the root module name of this logger + Returns: + logging.Logger: a logger + """ + def time_zone(sec, fmt): + real_time = datetime.datetime.now() + return real_time.timetuple() + logging.Formatter.converter = time_zone + + logger = logging.getLogger(name) + if level == "INFO": + logger.setLevel(logging.INFO) + elif level=="DEBUG": + logger.setLevel(logging.DEBUG) + logger.propagate = False + + if level == "DEBUG": + plain_formatter = logging.Formatter( + "[%(asctime)s] %(name)s %(levelname)s: %(message)s", + datefmt="%m/%d %H:%M:%S") + else: + plain_formatter = logging.Formatter( + "[%(asctime)s] %(message)s", + datefmt="%m/%d %H:%M:%S") + # stdout logging: master only + local_rank = ParallelEnv().local_rank + if local_rank == 0: + ch = logging.StreamHandler(stream=sys.stdout) + ch.setLevel(logging.DEBUG) + formatter = plain_formatter + ch.setFormatter(formatter) + logger.addHandler(ch) + + # file logging: all workers + if output is not None: + if output.endswith(".txt") or output.endswith(".log"): + filename = output + else: + filename = os.path.join(output, ".log.txt") + if local_rank > 0: + filename = filename + ".rank{}".format(local_rank) + + # PathManager.mkdirs(os.path.dirname(filename)) + os.makedirs(os.path.dirname(filename), exist_ok=True) + + # fh = logging.StreamHandler(_cached_log_stream(filename) + fh = logging.FileHandler(filename, mode='a') + fh.setLevel(logging.DEBUG) + fh.setFormatter(plain_formatter) + logger.addHandler(fh) + logger_initialized.append(name) + return logger + + +def get_logger(name, output=None): + logger = logging.getLogger(name) + if name in logger_initialized: + return logger + + return setup_logger(name=name, output=name) diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__init__.py b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__init__.py new file mode 100644 index 0000000..10295b5 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__init__.py @@ -0,0 +1,10 @@ +from .multigrid import MultigridSchedule +from .batchnorm_helper import get_norm, aggregate_sub_bn_stats +from .short_sampler import DistributedShortSampler +from .save_load_helper import subn_save, subn_load +from .interval_helper import is_eval_epoch + +__all__ = [ + 'MultigridSchedule', 'get_norm', 'aggregate_sub_bn_stats', + 'DistributedShortSampler', 'subn_save', 'subn_load', 'is_eval_epoch' +] diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..dc3ea59 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/batchnorm_helper.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/batchnorm_helper.cpython-310.pyc new file mode 100644 index 0000000..05cf1e8 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/batchnorm_helper.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/interval_helper.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/interval_helper.cpython-310.pyc new file mode 100644 index 0000000..ef20957 Binary files /dev/null and 
b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/interval_helper.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/multigrid.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/multigrid.cpython-310.pyc new file mode 100644 index 0000000..64dba74 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/multigrid.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/save_load_helper.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/save_load_helper.cpython-310.pyc new file mode 100644 index 0000000..68e467e Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/save_load_helper.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/short_sampler.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/short_sampler.cpython-310.pyc new file mode 100644 index 0000000..de4dd56 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/short_sampler.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/batchnorm_helper.py b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/batchnorm_helper.py new file mode 100644 index 0000000..e39b067 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/batchnorm_helper.py @@ -0,0 +1,142 @@ +from functools import partial +import paddle + + +def get_norm(bn_norm_type, bn_num_splits): + """ + Args: + cfg (CfgNode): model building configs, details are in the comments of + the config file. + Returns: + nn.Layer: the normalization layer. + """ + if bn_norm_type == "batchnorm": + return paddle.nn.BatchNorm3D + elif bn_norm_type == "sub_batchnorm": + return partial(SubBatchNorm3D, num_splits=bn_num_splits) + else: + raise NotImplementedError( + "Norm type {} is not supported".format(bn_norm_type)) + + +def aggregate_sub_bn_stats(model): + """ + Recursively find all SubBN modules and aggregate sub-BN stats. + Args: + model (nn.Layer): model to be aggregate sub-BN stats + Returns: + count (int): number of SubBN module found. + """ + count = 0 + for child in model.children(): + if isinstance(child, SubBatchNorm3D): + child.aggregate_stats() + count += 1 + else: + count += aggregate_sub_bn_stats(child) + return count + + +class SubBatchNorm3D(paddle.nn.Layer): + """ + Implement based on paddle2.0. + The standard BN layer computes stats across all examples in a GPU. In some + cases it is desirable to compute stats across only a subset of examples + SubBatchNorm3D splits the batch dimension into N splits, and run BN on + each of them separately (so that the stats are computed on each subset of + examples (1/N of batch) independently. During evaluation, it aggregates + the stats from all splits into one BN. + """ + def __init__(self, num_splits, **args): + """ + Args: + num_splits (int): number of splits. + args (list): list of args + """ + super(SubBatchNorm3D, self).__init__() + self.num_splits = num_splits + self.num_features = args["num_features"] + self.weight_attr = args["weight_attr"] + self.bias_attr = args["bias_attr"] + + # Keep only one set of weight and bias (outside). 
+ if self.weight_attr == False: + self.weight = self.create_parameter( + attr=None, + shape=[self.num_features], + default_initializer=paddle.nn.initializer.Constant(1.0)) + self.weight.stop_gradient = True + else: + self.weight = self.create_parameter( + attr=self.weight_attr, + shape=[self.num_features], + default_initializer=paddle.nn.initializer.Constant(1.0)) + self.weight.stop_gradient = self.weight_attr is not None \ + and self.weight_attr.learning_rate == 0. + + if self.bias_attr == False: + self.bias = self.create_parameter(attr=None, + shape=[self.num_features], + is_bias=True) + self.bias.stop_gradient = True + else: + self.bias = self.create_parameter(attr=self.bias_attr, + shape=[self.num_features], + is_bias=True) + self.bias.stop_gradient = self.bias_attr is not None \ + and self.bias_attr.learning_rate == 0. + + # set weights and bias fixed (inner). + args["weight_attr"] = False + args["bias_attr"] = False + self.bn = paddle.nn.BatchNorm3D(**args) + # update number of features used in split_bn + args["num_features"] = self.num_features * self.num_splits + self.split_bn = paddle.nn.BatchNorm3D(**args) + + def _get_aggregated_mean_std(self, means, stds, n): + """ + Calculate the aggregated mean and stds. + Use the method of update mean and std when merge multi-part data. + Args: + means (tensor): mean values. + stds (tensor): standard deviations. + n (int): number of sets of means and stds. + """ + mean = paddle.sum(paddle.reshape(means, (n, -1)), axis=0) / n + std = (paddle.sum(paddle.reshape(stds, (n, -1)), axis=0) / n + + paddle.sum(paddle.reshape( + paddle.pow((paddle.reshape(means, (n, -1)) - mean), 2), + (n, -1)), + axis=0) / n) + return mean, std + + def aggregate_stats(self): + """ + Synchronize running_mean, and running_var to self.bn. + Call this before eval, then call model.eval(); + When eval, forward function will call self.bn instead of self.split_bn, + During this time the running_mean, and running_var of self.bn has been obtained from + self.split_bn. + """ + if self.split_bn.training: + bn_mean_tensor, bn_variance_tensor = self._get_aggregated_mean_std( + self.split_bn._mean, + self.split_bn._variance, + self.num_splits, + ) + self.bn._mean.set_value(bn_mean_tensor) + self.bn._variance.set_value(bn_variance_tensor) + + def forward(self, x): + if self.training: + n, c, t, h, w = x.shape + x = paddle.reshape( + x, (n // self.num_splits, c * self.num_splits, t, h, w)) + x = self.split_bn(x) + x = paddle.reshape(x, (n, c, t, h, w)) + else: + x = self.bn(x) + x = paddle.multiply(x, paddle.reshape(self.weight, (-1, 1, 1, 1))) + x = paddle.add(x, paddle.reshape(self.bias, (-1, 1, 1, 1))) + return x diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/interval_helper.py b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/interval_helper.py new file mode 100644 index 0000000..2df4bc7 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/interval_helper.py @@ -0,0 +1,19 @@ +def is_eval_epoch(cfg, cur_epoch, total_epochs, multigrid_schedule): + """ + Determine if the model should be evaluated at the current epoch. + Args: + cfg (CfgNode): configs. Details can be found in + slowfast/config/defaults.py + cur_epoch (int): current epoch. + multigrid_schedule (List): schedule for multigrid training. 
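# Illustrative usage sketch (assumed, not part of the original files): the
# intended Sub-BN workflow with the helpers above. get_norm returns a
# constructor, training runs through split_bn, and aggregate_sub_bn_stats
# folds the split statistics back into the plain BN buffers before
# evaluation. video_model is a placeholder paddle.nn.Layer that contains
# SubBatchNorm3D sublayers.
from paddlevideo.utils.multigrid import get_norm, aggregate_sub_bn_stats

norm_cls = get_norm("sub_batchnorm", 4)                    # partial(SubBatchNorm3D, num_splits=4)
bn = norm_cls(num_features=64, weight_attr=None, bias_attr=None)

# ... training iterations on video_model ...
count = aggregate_sub_bn_stats(video_model)                # copy split_bn stats into bn
print("aggregated {} SubBatchNorm3D layers".format(count))
video_model.eval()                                         # eval path now uses self.bn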
+ """ + if cur_epoch + 1 == total_epochs: + return True + if multigrid_schedule is not None: + prev_epoch = 0 + for s in multigrid_schedule: + if cur_epoch < s[-1]: + period = max( + (s[-1] - prev_epoch) // cfg.MULTIGRID.EVAL_FREQ + 1, 1) + return (s[-1] - 1 - cur_epoch) % period == 0 + prev_epoch = s[-1] diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/multigrid.py b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/multigrid.py new file mode 100644 index 0000000..a296a06 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/multigrid.py @@ -0,0 +1,233 @@ +"""Functions for multigrid training.""" + +import numpy as np + + +class MultigridSchedule(object): + """ + This class defines multigrid training schedule and update cfg accordingly. + """ + def init_multigrid(self, cfg): + """ + Update cfg based on multigrid settings. + Args: + cfg (configs): configs that contains training and multigrid specific + hyperparameters. + Returns: + cfg (configs): the updated cfg. + """ + self.schedule = None + # We may modify cfg.DATASET.batch_size, cfg.PIPELINE.train.decode_sampler.num_frames, and + # cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size'] during training, so we store their original + # value in cfg and use them as global variables. + cfg.MULTIGRID.default_batch_size = cfg.DATASET.batch_size # total bs,64 + cfg.MULTIGRID.default_temporal_size = cfg.PIPELINE.train.decode_sampler.num_frames # 32 + cfg.MULTIGRID.default_crop_size = cfg.PIPELINE.train.transform[1][ + 'MultiCrop']['target_size'] # 224 + + if cfg.MULTIGRID.LONG_CYCLE: + self.schedule = self.get_long_cycle_schedule(cfg) + cfg.OPTIMIZER.learning_rate.steps = [0] + [ + s[-1] for s in self.schedule + ] + # Fine-tuning phase. + cfg.OPTIMIZER.learning_rate.steps[-1] = ( + cfg.OPTIMIZER.learning_rate.steps[-2] + + cfg.OPTIMIZER.learning_rate.steps[-1]) // 2 + cfg.OPTIMIZER.learning_rate.lrs = [ + cfg.OPTIMIZER.learning_rate.gamma**s[0] * s[1][0] + for s in self.schedule + ] + # Fine-tuning phase. + cfg.OPTIMIZER.learning_rate.lrs = cfg.OPTIMIZER.learning_rate.lrs[:-1] + [ + cfg.OPTIMIZER.learning_rate.lrs[-2], + cfg.OPTIMIZER.learning_rate.lrs[-1], + ] + + cfg.OPTIMIZER.learning_rate.max_epoch = self.schedule[-1][-1] + + elif cfg.MULTIGRID.SHORT_CYCLE: + cfg.OPTIMIZER.learning_rate.steps = [ + int(s * cfg.MULTIGRID.epoch_factor) + for s in cfg.OPTIMIZER.learning_rate.steps + ] + cfg.OPTIMIZER.learning_rate.max_epoch = int( + cfg.OPTIMIZER.learning_rate.max_epoch * + cfg.OPTIMIZER.learning_rate.max_epoch) + return cfg + + def update_long_cycle(self, cfg, cur_epoch): + """ + Before every epoch, check if long cycle shape should change. If it + should, update cfg accordingly. + Args: + cfg (configs): configs that contains training and multigrid specific + hyperparameters. + cur_epoch (int): current epoch index. + Returns: + cfg (configs): the updated cfg. 
+ changed (bool): whether to change long cycle shape at this epoch + """ + base_b, base_t, base_s = get_current_long_cycle_shape( + self.schedule, cur_epoch) + if base_s != cfg.PIPELINE.train.transform[1]['MultiCrop'][ + 'target_size'] or base_t != cfg.PIPELINE.train.decode_sampler.num_frames: + #NOTE Modify + # no need to modify, used by pool_size in head, None when multigrid + # cfg.MODEL.head.num_frames = base_t + # cfg.MODEL.head.crop_size = base_s + cfg.PIPELINE.train.decode_sampler.num_frames = base_t + cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size'] = base_s + cfg.DATASET.batch_size = base_b * cfg.MULTIGRID.default_batch_size #change bs + + bs_factor = (float(cfg.DATASET.batch_size) / + cfg.MULTIGRID.bn_base_size) + + if bs_factor == 1: #single bs == bn_base_size (== 8) + cfg.MODEL.backbone.bn_norm_type = "batchnorm" + else: + cfg.MODEL.backbone.bn_norm_type = "sub_batchnorm" + cfg.MODEL.backbone.bn_num_splits = int(bs_factor) + + cfg.MULTIGRID.long_cycle_sampling_rate = cfg.PIPELINE.train.decode_sampler.sampling_rate * ( + cfg.MULTIGRID.default_temporal_size // base_t) + print("Long cycle updates:") + print("\tbn_norm_type: {}".format(cfg.MODEL.backbone.bn_norm_type)) + if cfg.MODEL.backbone.bn_norm_type == "sub_batchnorm": + print("\tbn_num_splits: {}".format( + cfg.MODEL.backbone.bn_num_splits)) + print("\tTRAIN.batch_size[single card]: {}".format( + cfg.DATASET.batch_size)) + print("\tDATA.NUM_FRAMES x LONG_CYCLE_SAMPLING_RATE: {}x{}".format( + cfg.PIPELINE.train.decode_sampler.num_frames, + cfg.MULTIGRID.long_cycle_sampling_rate)) + print("\tDATA.train_crop_size: {}".format( + cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size'])) + return cfg, True + else: + return cfg, False + + def get_long_cycle_schedule(self, cfg): + """ + Based on multigrid hyperparameters, define the schedule of a long cycle. + Args: + cfg (configs): configs that contains training and multigrid specific + hyperparameters. + Returns: + schedule (list): Specifies a list long cycle base shapes and their + corresponding training epochs. + """ + + steps = cfg.OPTIMIZER.learning_rate.steps + + default_size = float( + cfg.PIPELINE.train.decode_sampler.num_frames * + cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size']** + 2) # 32 * 224 * 224 C*H*W + default_iters = steps[-1] # 196 + + # Get shapes and average batch size for each long cycle shape. + avg_bs = [] + all_shapes = [] + # for t_factor, s_factor in cfg.MULTIGRID.long_cycle_factors: + for item in cfg.MULTIGRID.long_cycle_factors: + t_factor, s_factor = item["value"] + base_t = int( + round(cfg.PIPELINE.train.decode_sampler.num_frames * t_factor)) + base_s = int( + round( + cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size'] + * s_factor)) + if cfg.MULTIGRID.SHORT_CYCLE: + shapes = [ + [ + base_t, + cfg.MULTIGRID.default_crop_size * + cfg.MULTIGRID.short_cycle_factors[0], + ], + [ + base_t, + cfg.MULTIGRID.default_crop_size * + cfg.MULTIGRID.short_cycle_factors[1], + ], + [base_t, base_s], + ] #first two is short_cycle, last is the base long_cycle + else: + shapes = [[base_t, base_s]] + + # (T, S) -> (B, T, S) + shapes = [[ + int(round(default_size / (s[0] * s[1] * s[1]))), s[0], s[1] + ] for s in shapes] + avg_bs.append(np.mean([s[0] for s in shapes])) + all_shapes.append(shapes) + + # Get schedule regardless of cfg.MULTIGRID.epoch_factor. 
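# Illustrative usage sketch (assumed, not part of the original files): how the
# long-cycle schedule is typically consumed from a training loop. The trainer
# itself is not part of this diff, so build_dataloader, train_one_epoch and
# validate are placeholders.
from paddlevideo.utils.multigrid import MultigridSchedule, is_eval_epoch

multigrid = MultigridSchedule()
cfg = multigrid.init_multigrid(cfg)
total_epochs = cfg.OPTIMIZER.learning_rate.max_epoch

for epoch in range(total_epochs):
    if cfg.MULTIGRID.LONG_CYCLE:
        cfg, changed = multigrid.update_long_cycle(cfg, epoch)
        if changed:
            # batch size / num_frames / crop size changed for this cycle:
            # rebuild the dataloader (and BN type) for the new shape
            train_loader = build_dataloader(cfg)       # placeholder
    train_one_epoch(model, train_loader)               # placeholder
    if is_eval_epoch(cfg, epoch, total_epochs, multigrid.schedule):
        validate(model)                                # placeholder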
+ total_iters = 0 + schedule = [] + for step_index in range(len(steps) - 1): + step_epochs = steps[step_index + 1] - steps[step_index] + + for long_cycle_index, shapes in enumerate(all_shapes): + #ensure each of 4 sequences run the same num of iters + cur_epochs = (step_epochs * avg_bs[long_cycle_index] / + sum(avg_bs)) + + # get cur_iters from cur_epochs + cur_iters = cur_epochs / avg_bs[long_cycle_index] + total_iters += cur_iters + schedule.append((step_index, shapes[-1], cur_epochs)) + + iter_saving = default_iters / total_iters # ratio between default iters and real iters + + final_step_epochs = cfg.OPTIMIZER.learning_rate.max_epoch - steps[-1] + + # We define the fine-tuning phase to have the same amount of iteration + # saving as the rest of the training. + #final_step_epochs / iter_saving make fine-tune having the same iters as training + ft_epochs = final_step_epochs / iter_saving * avg_bs[-1] + + # schedule.append((step_index + 1, all_shapes[-1][2], ft_epochs)) + schedule.append((step_index + 1, all_shapes[-1][-1], ft_epochs)) + + # Obtrain final schedule given desired cfg.MULTIGRID.epoch_factor. + x = (cfg.OPTIMIZER.learning_rate.max_epoch * + cfg.MULTIGRID.epoch_factor / sum(s[-1] for s in schedule)) + + final_schedule = [] + total_epochs = 0 + for s in schedule: + epochs = s[2] * x + total_epochs += epochs + final_schedule.append((s[0], s[1], int(round(total_epochs)))) + print_schedule(final_schedule) + return final_schedule + + +def print_schedule(schedule): + """ + Log schedule. + """ + print( + "Long_cycle_index\tBase_shape(bs_factor,temporal_size,crop_size)\tEpochs" + ) + for s in schedule: + print("{}\t\t\t{}\t\t\t\t\t{}".format(s[0], s[1], s[2])) + + +def get_current_long_cycle_shape(schedule, epoch): + """ + Given a schedule and epoch index, return the long cycle base shape. + Args: + schedule (configs): configs that contains training and multigrid specific + hyperparameters. + cur_epoch (int): current epoch index. + Returns: + shapes (list): A list describing the base shape in a long cycle: + [batch size relative to default, + number of frames, spatial dimension]. + """ + for s in schedule: + if epoch < s[-1]: + return s[1] + return schedule[-1][1] diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/save_load_helper.py b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/save_load_helper.py new file mode 100644 index 0000000..94a52d5 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/save_load_helper.py @@ -0,0 +1,237 @@ +import os +import numpy as np +import paddle +import copy + + +def sub_to_normal_bn(sd): + """ + When save, Convert the Sub-BN paprameters to normal BN parameters in a state dict. + There are two copies of BN layers in a Sub-BN implementation: `bn.bn` and + `bn.split_bn`. `bn.split_bn` is used during training and + "compute_precise_bn". Before saving or evaluation, its stats are copied to + `bn.bn`. We rename `bn.bn` to `bn` and store it to be consistent with normal + BN layers. + Args: + sd (OrderedDict): a dict of parameters which might contain Sub-BN + parameters. + Returns: + new_sd (OrderedDict): a dict with Sub-BN parameters reshaped to + normal parameters. 
+ """ + modifications = [ + ("bn.bn._mean", "bn._mean"), + ("bn.bn._variance", "bn._variance"), + ] + to_remove = ["bn.bn.", ".split_bn."] + key_list = list(sd.keys()) #odict_keys to list + for key in key_list: + for before, after in modifications: + if key.endswith(before): + new_key = key.split(before)[0] + after + sd[new_key] = sd.pop(key) + + for rm in to_remove: + if rm in key and key in sd: + del sd[key] + + +def normal_to_sub_bn(checkpoint_sd, model_sd): + """ + When load, Convert BN parameters to Sub-BN parameters if model contains Sub-BNs. + Args: + checkpoint_sd (OrderedDict): source dict of parameters. + model_sd (OrderedDict): target dict of parameters. + Returns: + new_sd (OrderedDict): converted dict of parameters. + """ + for key in model_sd: + if key not in checkpoint_sd: + # not to replace bn.weight and bn.bias + if "bn.split_bn." in key and "bn.weight" not in key and "bn.bias" not in key: + load_key = key.replace("bn.split_bn.", "bn.") + bn_key = key.replace("bn.split_bn.", "bn.bn.") + checkpoint_sd[key] = checkpoint_sd.pop(load_key) + checkpoint_sd[bn_key] = checkpoint_sd[key] + + # match the shape of bn.split_bn._xx + # model_sd: split_bn.rm.shape = num_feature*num_split + # checkpoint_sd: split_bn.rm.shape = bn.rm.shape = num_feature + for key in model_sd: + if key in checkpoint_sd: + model_blob_shape = model_sd[key].shape #bn.split_bn + c2_blob_shape = checkpoint_sd[key].shape #bn.bn + + if (len(model_blob_shape) == 1 and len(c2_blob_shape) == 1 + and model_blob_shape[0] > c2_blob_shape[0] + and model_blob_shape[0] % c2_blob_shape[0] == 0): + before_shape = checkpoint_sd[key].shape + checkpoint_sd[key] = np.concatenate( + [checkpoint_sd[key]] * + (model_blob_shape[0] // c2_blob_shape[0])) + if 'split_bn' not in key: #split_bn is excepted + print("{} {} -> {}".format(key, before_shape, + checkpoint_sd[key].shape)) + return checkpoint_sd + + +def mapping_opt_dict(opt_dict, model_key_list): + """ + Paddle Name schedule: conv_1.w -> conv_2.w + Sometimes: sub_bn -> bn + when re-build model, we desire the parameter name to be coincident, + but the parameters name index will be added, as conv_1 to conv_2, not conv_1. + It will raise error if we set old saved parameters to new created optimizer. + as conv_2 cannot find in state_dict(only conv_1). + Args: + opt_dict: optimizer state dict, including the name and value of parameters gradient. + model_key_list: the parameters name list of re-build model. 
+ Return: optimizer state dict with modified keys + """ + def get_name_info(PNAME, PN_key_list, key_list): + min_index = float('inf') + max_index = 0 + for name in PN_key_list[1:]: + for key in key_list: + if name in key: + index = int(key.split('.')[0].split(name)[-1]) + if index < min_index: + min_index = index + if index > max_index: + max_index = index + num_name = max_index - min_index + 1 + PNAME[name].append((min_index, max_index, num_name)) + min_index = float('inf') + max_index = 0 + + PNAME = { + "LR_Scheduler": [], + "conv3d_": [], + "linear_": [], + "sub_batch_norm3d_": [], + "batch_norm3d_": [], + } + + pd_key_list = list(opt_dict.keys()) + print("The number of parameters in saved optimizer state dict = {}".format( + len(pd_key_list))) + print("The number of parameters in re-build model list = {}".format( + len(model_key_list))) + # 1 may be LR_Scheduler + PN_key_list = list(PNAME.keys()) + + # get the number of each PNAME + get_name_info(PNAME, PN_key_list, pd_key_list) + get_name_info(PNAME, PN_key_list, model_key_list) + print("[Parameters info] prefix: min_index, max_index, number_params: \n", + PNAME) + + # whether to change name of bn layer + change_name = False + if PNAME["sub_batch_norm3d_"][0][-1] == -float('inf'): + PN_key_list.remove("sub_batch_norm3d_") + if PNAME["sub_batch_norm3d_"][1][-1] != -float('inf'): + print( + "Optimizer state dict saved bn, but Re-build model use sub_bn, changed name!" + ) + change_name = True + else: + print("Optimizer state dict saved bn, and Re-build model use bn") + else: + PN_key_list.remove("batch_norm3d_") + if PNAME["sub_batch_norm3d_"][1][-1] == -float('inf'): + print( + "Optimizer state dict saved sub_bn, but Re-build model use bn, changed name!" + ) + change_name = True + else: + print( + "Optimizer state dict saved sub_bn, Re-build model use sub_bn") + + #update key name + # sub_bn -> bn name mapping, pre-define dict + change_dict = { + "sub_batch_norm3d_": "batch_norm3d_", + "batch_norm3d_": "sub_batch_norm3d_" + } + for key in pd_key_list: + for name in PN_key_list[1:]: + if key.startswith(name): + start = change_dict[name] if ( + change_name and "batch_norm" in name) else name + str_index = key.split('.')[0].split(name)[-1] + index = int(str_index) + new_index = str(index + + (PNAME[start][1][0] - PNAME[name][0][0])) + end = key.split('.')[-1] + update_key = start + new_index + '.' + end + opt_dict[update_key] = opt_dict.pop(key) + + return opt_dict + + +def subn_save(save_dir, name_prefix, epoch, video_model, optimizer): + if not os.path.isdir(save_dir): + os.makedirs(save_dir) + model_path = os.path.join(save_dir, name_prefix + "{:05d}".format(epoch)) + model_dict = video_model.state_dict() + sub_to_normal_bn(model_dict) + opti_dict = optimizer.state_dict() + paddle.save(model_dict, model_path + '.pdparams') + paddle.save(opti_dict, model_path + '.pdopt') + print('[Saved Epoch {} parameters and optimizer state ]'.format(epoch)) + + +def subn_load(model, ck_path, optimizer=None): + """ + Load the checkpoint from the given file. + Args: + model (model): model to load the weights from the checkpoint. + optimizer (optim, optional): optimizer to load the historical state. + ck_path (str): checkpoint path + Returns: + (int): the number of training epoch of the checkpoint. 
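# Illustrative usage sketch (assumed, not part of the original files): the
# checkpoint round trip with the helpers in this file. subn_save folds Sub-BN
# statistics into plain BN names before writing; subn_load expands them again
# and remaps optimizer parameter names if the rebuilt model switched between
# bn and sub_bn. Paths, prefix and epoch number are placeholders.
from paddlevideo.utils.multigrid import subn_save, subn_load

subn_save("output/", "ppTSM_mg_epoch_", 10, video_model, optimizer)
# ... later, possibly after rebuilding the model with a different BN type ...
subn_load(video_model, "output/ppTSM_mg_epoch_00010", optimizer)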
+ """ + + assert os.path.exists(ck_path + ".pdparams"), \ + "Given dir {}.pdparams not exist.".format(ck_path) + print("load checkpint from {}.pdparams".format(ck_path)) + + model_dict = model.state_dict() + checkpoint_dict = paddle.load(ck_path + ".pdparams") + # checkpoint_dict = copy.deepcopy(checkpoint_dict_orig) #not modify when multi card + pre_train_dict = normal_to_sub_bn(checkpoint_dict, model_dict) + + # Match pre-trained weights that have same shape as current model. + pre_train_dict_match = { + k: v + for k, v in pre_train_dict.items() + if k in model_dict and tuple(v.shape) == tuple(model_dict[k].shape) + } + + # Weights that do not have match from the pre-trained model. + not_load_layers = [ + k for k in model_dict.keys() if k not in pre_train_dict_match.keys() + ] + # Log weights that are not loaded with the pre-trained weights. + if not_load_layers: + for k in not_load_layers: + if 'bn.weight' not in k and 'bn.bias' not in k: + print("Network weights {} not loaded.".format(k)) + + # Load pre-trained weights. + model.set_state_dict(pre_train_dict_match) + + if optimizer: + assert os.path.exists(ck_path + ".pdopt"), \ + "Given dir {}.pdopt not exist.".format(ck_path) + print("load checkpint from {}.pdopt".format(ck_path)) + opt_dict = paddle.load(ck_path + ".pdopt") + # get parameters that required gradient from re-build model + model_key_list = [] + for param in model.parameters(): + if param.stop_gradient == False: + model_key_list.append(param.name) + + new_opt_dict = mapping_opt_dict(opt_dict, model_key_list) + optimizer.set_state_dict(new_opt_dict) diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/short_sampler.py b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/short_sampler.py new file mode 100644 index 0000000..0004dac --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/short_sampler.py @@ -0,0 +1,147 @@ +from __future__ import print_function +from __future__ import division + +import numpy as np +import math + +from paddle.io import BatchSampler + +__all__ = ["DistributedShortSampler"] + + +class DistributedShortSampler(BatchSampler): + """Sampler that restricts data loading to a subset of the dataset. + In such case, each process can pass a DistributedBatchSampler instance + as a DataLoader sampler, and load a subset of the original dataset that + is exclusive to it. + .. note:: + Batch size is dynamic changed following short cycle schedule. + + Args: + dataset(paddle.io.Dataset): this could be a `paddle.io.Dataset` implement + or other python object which implemented + `__len__` for BatchSampler to get sample + number of data source. + batch_sizes(list): batch size list of one cycle. + num_replicas(int, optional): porcess number in distributed training. + If :attr:`num_replicas` is None, :attr:`num_replicas` will be + retrieved from :code:`paddle.fluid.dygraph.parallel.ParallenEnv`. + Default None. + rank(int, optional): the rank of the current process among :attr:`num_replicas` + processes. If :attr:`rank` is None, :attr:`rank` is retrieved from + :code:`paddle.fluid.dygraph.parallel.ParallenEnv`. Default None. + shuffle(bool): whther to shuffle indices order before genrating + batch indices. Default False. + drop_last(bool): whether drop the last incomplete batch dataset size + is not divisible by the batch size. 
Default False + """ + def __init__(self, + dataset, + batch_sizes, + num_replicas=None, + rank=None, + shuffle=False, + drop_last=False): + self.dataset = dataset + + assert any(isinstance(batch_size, int) and batch_size > 0 for batch_size in batch_sizes), \ + "batch_size should be a positive integer" + self.batch_sizes = batch_sizes + self.len_batch_sizes = len(self.batch_sizes) + assert isinstance(shuffle, bool), \ + "shuffle should be a boolean value" + self.shuffle = shuffle + assert isinstance(drop_last, bool), \ + "drop_last should be a boolean number" + + from paddle.distributed import ParallelEnv + + if num_replicas is not None: + assert isinstance(num_replicas, int) and num_replicas > 0, \ + "num_replicas should be a positive integer" + self.nranks = num_replicas + else: + self.nranks = ParallelEnv().nranks + + if rank is not None: + assert isinstance(rank, int) and rank >= 0, \ + "rank should be a non-negative integer" + self.local_rank = rank + else: + self.local_rank = ParallelEnv().local_rank + + self.drop_last = drop_last + self.epoch = 0 + self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.nranks)) + self.total_size = self.num_samples * self.nranks + + def __iter__(self): + num_samples = len(self.dataset) + indices = np.arange(num_samples).tolist() + indices += indices[:(self.total_size - + len(indices))] #completion last iter + assert len(indices) == self.total_size + if self.shuffle: + np.random.RandomState(self.epoch).shuffle(indices) + self.epoch += 1 + + # subsample + def _get_indices_by_batch_size(indices): + total_batch_size = sum(self.batch_sizes) + subsampled_indices = [] + last_batch_size = self.total_size % ( + total_batch_size * self.nranks) #number samples of last batch + assert last_batch_size % self.nranks == 0 + last_local_batch_size = last_batch_size // self.nranks + + for i in range(self.local_rank * total_batch_size, + len(indices) - last_batch_size, + total_batch_size * self.nranks): + subsampled_indices.extend(indices[i:i + total_batch_size]) + + indices = indices[len(indices) - last_batch_size:] + subsampled_indices.extend( + indices[self.local_rank * + last_local_batch_size:(self.local_rank + 1) * + last_local_batch_size]) + return subsampled_indices + + if self.nranks > 1: + indices = _get_indices_by_batch_size(indices) + + assert len(indices) == self.num_samples #index length in each card + _sample_iter = iter(indices) + + batch_indices = [] + counter = 0 + batch_size = self.batch_sizes[0] + for idx in _sample_iter: + batch_indices.append( + (idx, counter % + self.len_batch_sizes)) #to be used in dataloader get_item + if len(batch_indices) == batch_size: + yield batch_indices + counter += 1 + batch_size = self.batch_sizes[counter % self.len_batch_sizes] + batch_indices = [] + if not self.drop_last and len(batch_indices) > 0: + yield batch_indices + + def __len__(self): + avg_batch_size = sum(self.batch_sizes) / float(self.len_batch_sizes) + if self.drop_last: + return int(np.floor(self.num_samples / avg_batch_size)) + else: + return int(np.ceil(self.num_samples / avg_batch_size)) + + def set_epoch(self, epoch): + """ + Sets the epoch number. When :attr:`shuffle=True`, this number is used + as seeds of random numbers. By default, users may not set this, all + replicas (workers) use a different random ordering for each epoch. + If set same number at each epoch, this sampler will yield the same + ordering at all epoches. + Arguments: + epoch (int): Epoch number. 
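# Illustrative usage sketch (assumed, not part of the original files): the
# sampler yields batches of (index, cycle_position) tuples with a batch size
# that cycles through batch_sizes, so the wrapped dataset must accept that
# tuple in __getitem__. video_dataset and num_epochs are placeholders.
import paddle
from paddlevideo.utils.multigrid import DistributedShortSampler

sampler = DistributedShortSampler(video_dataset,
                                  batch_sizes=[32, 16, 8],   # one short cycle
                                  shuffle=True,
                                  drop_last=True)
loader = paddle.io.DataLoader(video_dataset, batch_sampler=sampler, num_workers=4)

for epoch in range(num_epochs):
    sampler.set_epoch(epoch)        # reproducible shuffling per epoch
    for batch in loader:
        ...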
+ """ + self.epoch = epoch diff --git a/Bank_second_part/detect_process/paddlevideo/utils/precise_bn.py b/Bank_second_part/detect_process/paddlevideo/utils/precise_bn.py new file mode 100644 index 0000000..c9fdd40 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/precise_bn.py @@ -0,0 +1,94 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import itertools + +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") +""" +Implement precise bn, which is useful for improving accuracy. +""" + + +@paddle.no_grad() # speed up and save CUDA memory +def do_preciseBN(model, + data_loader, + parallel, + num_iters=200, + use_amp=False, + amp_level=None): + """ + Recompute and update the batch norm stats to make them more precise. During + training both BN stats and the weight are changing after every iteration, so + the running average can not precisely reflect the actual stats of the + current model. + In this function, the BN stats are recomputed with fixed weights, to make + the running average more precise. Specifically, it computes the true average + of per-batch mean/variance instead of the running average. + This is useful to improve validation accuracy. + Args: + model: the model whose bn stats will be recomputed + data_loader: an iterator. Produce data as input to the model + num_iters: number of iterations to compute the stats. + Return: + the model with precise mean and variance in bn layers. + """ + bn_layers_list = [ + m for m in model.sublayers() + if any((isinstance(m, bn_type) + for bn_type in (paddle.nn.BatchNorm1D, paddle.nn.BatchNorm2D, + paddle.nn.BatchNorm3D))) and m.training + ] + if len(bn_layers_list) == 0: + return + + # moving_mean=moving_mean*momentum+batch_mean*(1.−momentum) + # we set momentum=0. to get the true mean and variance during forward + momentum_actual = [bn._momentum for bn in bn_layers_list] + for bn in bn_layers_list: + bn._momentum = 0. + + running_mean = [paddle.zeros_like(bn._mean) + for bn in bn_layers_list] # pre-ignore + running_var = [paddle.zeros_like(bn._variance) for bn in bn_layers_list] + + ind = -1 + for ind, data in enumerate(itertools.islice(data_loader, num_iters)): + logger.info("Computing precise BN {} / {}...".format( + ind + 1, num_iters)) + + if use_amp: + with paddle.amp.auto_cast( + custom_black_list={"reduce_mean", + "conv3d"}, level=amp_level): + model(data, mode='train') + else: + model(data, mode='train') + + for i, bn in enumerate(bn_layers_list): + # Accumulates the bn stats. + running_mean[i] += (bn._mean - running_mean[i]) / (ind + 1) + running_var[i] += (bn._variance - running_var[i]) / (ind + 1) + + assert ind == num_iters - 1, ( + "update_bn_stats is meant to run for {} iterations, but the dataloader stops at {} iterations." + .format(num_iters, ind)) + + # Sets the precise bn stats. 
+ for i, bn in enumerate(bn_layers_list): + bn._mean.set_value(running_mean[i]) + bn._variance.set_value(running_var[i]) + bn._momentum = momentum_actual[i] diff --git a/Bank_second_part/detect_process/paddlevideo/utils/profiler.py b/Bank_second_part/detect_process/paddlevideo/utils/profiler.py new file mode 100644 index 0000000..629ef4e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/profiler.py @@ -0,0 +1,128 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import paddle.profiler as profiler + +# A global variable to record the number of calling times for profiler +# functions. It is used to specify the tracing range of training steps. +_profiler_step_id = 0 + +# A global variable to avoid parsing from string every time. +_profiler_options = None +_prof = None + +class ProfilerOptions(object): + ''' + Use a string to initialize a ProfilerOptions. + The string should be in the format: "key1=value1;key2=value;key3=value3". + For example: + "profile_path=model.profile" + "batch_range=[50, 60]; profile_path=model.profile" + "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" + + ProfilerOptions supports following key-value pair: + batch_range - a integer list, e.g. [100, 110]. + state - a string, the optional values are 'CPU', 'GPU' or 'All'. + sorted_key - a string, the optional values are 'calls', 'total', + 'max', 'min' or 'ave. + tracer_option - a string, the optional values are 'Default', 'OpDetail', + 'AllOpDetail'. + profile_path - a string, the path to save the serialized profile data, + which can be used to generate a timeline. + exit_on_finished - a boolean. + ''' + + def __init__(self, options_str): + assert isinstance(options_str, str) + + self._options = { + 'batch_range': [10, 20], + 'state': 'All', + 'sorted_key': 'total', + 'tracer_option': 'Default', + 'profile_path': '/tmp/profile', + 'exit_on_finished': True, + 'timer_only': True + } + self._parse_from_string(options_str) + + def _parse_from_string(self, options_str): + for kv in options_str.replace(' ', '').split(';'): + key, value = kv.split('=') + if key == 'batch_range': + value_list = value.replace('[', '').replace(']', '').split(',') + value_list = list(map(int, value_list)) + if len(value_list) >= 2 and value_list[0] >= 0 and value_list[ + 1] > value_list[0]: + self._options[key] = value_list + elif key == 'exit_on_finished': + self._options[key] = value.lower() in ("yes", "true", "t", "1") + elif key in [ + 'state', 'sorted_key', 'tracer_option', 'profile_path' + ]: + self._options[key] = value + elif key == 'timer_only': + self._options[key] = value + + def __getitem__(self, name): + if self._options.get(name, None) is None: + raise ValueError( + "ProfilerOptions does not have an option named %s." % name) + return self._options[name] + + +def add_profiler_step(options_str=None): + ''' + Enable the operator-level timing using PaddlePaddle's profiler. 
+ The profiler uses a independent variable to count the profiler steps. + One call of this function is treated as a profiler step. + Args: + profiler_options - a string to initialize the ProfilerOptions. + Default is None, and the profiler is disabled. + ''' + if options_str is None: + return + + global _prof + global _profiler_step_id + global _profiler_options + + if _profiler_options is None: + _profiler_options = ProfilerOptions(options_str) + # profile : https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/profiling_model.html#chakanxingnengshujudetongjibiaodan + # timer_only = True only the model's throughput and time overhead are displayed + # timer_only = False calling summary can print a statistical form that presents performance data from different perspectives. + # timer_only = False the output Timeline information can be found in the profiler_log directory + if _prof is None: + _timer_only = str(_profiler_options['timer_only']) == str(True) + _prof = profiler.Profiler( + scheduler = (_profiler_options['batch_range'][0], _profiler_options['batch_range'][1]), + on_trace_ready = profiler.export_chrome_tracing('./profiler_log'), + timer_only = _timer_only) + _prof.start() + else: + _prof.step() + + if _profiler_step_id == _profiler_options['batch_range'][1]: + _prof.stop() + _prof.summary( + op_detail=True, + thread_sep=False, + time_unit='ms') + _prof = None + if _profiler_options['exit_on_finished']: + sys.exit(0) + + _profiler_step_id += 1 diff --git a/Bank_second_part/detect_process/paddlevideo/utils/record.py b/Bank_second_part/detect_process/paddlevideo/utils/record.py new file mode 100644 index 0000000..4aad434 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/record.py @@ -0,0 +1,163 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
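# Illustrative usage sketch (assumed, not part of the original files): wiring
# the profiler hook above into a training loop. The options string would
# normally come from a command-line flag; it is hard-coded here as a
# placeholder, and train_loader / train_step are placeholders too.
from paddlevideo.utils.profiler import add_profiler_step

profiler_options = "batch_range=[50, 60]; profile_path=model.profile; exit_on_finished=true"

for batch_id, data in enumerate(train_loader):
    train_step(data)
    add_profiler_step(profiler_options)   # no-op when called with None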
+ +import datetime +from collections import OrderedDict + +import paddle + +from .logger import coloring, get_logger + +logger = get_logger("paddlevideo") + +__all__ = ['AverageMeter', 'build_record', 'log_batch', 'log_epoch'] + + +def build_record(cfg): + record_list = [ + ("loss", AverageMeter('loss', '7.5f')), + ("lr", AverageMeter('lr', 'f', need_avg=False)), + ] + if 'Recognizer1D' in cfg.framework: #TODO: required specify str in framework + record_list.append(("hit_at_one", AverageMeter("hit_at_one", '.5f'))) + record_list.append(("perr", AverageMeter("perr", '.5f'))) + record_list.append(("gap", AverageMeter("gap", '.5f'))) + elif 'Recognizer' in cfg.framework: + record_list.append(("top1", AverageMeter("top1", '.5f'))) + record_list.append(("top5", AverageMeter("top5", '.5f'))) + elif 'FastRCNN' in cfg.framework: + record_list.append( + ("recall@thr=0.5", AverageMeter("recall@thr=0.5", '.5f'))) + record_list.append(("prec@thr=0.5", AverageMeter("prec@thr=0.5", + '.5f'))) + record_list.append(("recall@top3", AverageMeter("recall@top3", '.5f'))) + record_list.append(("prec@top3", AverageMeter("prec@top3", '.5f'))) + record_list.append(("recall@top5", AverageMeter("recall@top5", '.5f'))) + record_list.append(("prec@top5", AverageMeter("prec@top5", '.5f'))) + record_list.append(("mAP@0.5IOU", AverageMeter("mAP@0.5IOU", '.5f'))) + elif 'DepthEstimator' in cfg.framework: + record_list.append(("abs_rel", AverageMeter("abs_rel", '.5f'))) + record_list.append(("sq_rel", AverageMeter("sq_rel", '.5f'))) + record_list.append(("rmse", AverageMeter("rmse", '.5f'))) + record_list.append(("rmse_log", AverageMeter("rmse_log", '.5f'))) + record_list.append(("a1", AverageMeter("a1", '.5f'))) + record_list.append(("a2", AverageMeter("a2", '.5f'))) + record_list.append(("a3", AverageMeter("a3", '.5f'))) + record_list.append(("losses_day", AverageMeter("losses_day", '.5f'))) + record_list.append(("losses_night", AverageMeter("losses_night", + '.5f'))) + elif 'MSTCN' in cfg.framework or 'ASRF' in cfg.framework: + record_list.append(("F1@0.50", AverageMeter("F1@0.50", '.5f'))) + + elif 'YOWOLocalizer' in cfg.framework: + record_list.append(("nCorrect", AverageMeter('nCorrect', '.1f'))) + record_list.append(("fscore", AverageMeter("fscore", '.5f'))) + + record_list.append(("batch_time", AverageMeter('batch_cost', '.5f'))) + record_list.append(("reader_time", AverageMeter('reader_cost', '.5f'))) + record_list = OrderedDict(record_list) + return record_list + + +class AverageMeter(object): + """ + Computes and stores the average and current value + """ + def __init__(self, name='', fmt='f', need_avg=True): + self.name = name + self.fmt = fmt + self.need_avg = need_avg + self.reset() + + def reset(self): + """ reset """ + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + """ update """ + if isinstance(val, paddle.Tensor): + val = float(val) + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + @property + def total(self): + return '{self.name}_sum: {self.sum:{self.fmt}}'.format(self=self) + + @property + def total_minute(self): + return '{self.name}_sum: {s:{self.fmt}} min'.format(s=self.sum / 60, + self=self) + + @property + def mean(self): + return '{self.name}_avg: {self.avg:{self.fmt}}'.format( + self=self) if self.need_avg else '' + + @property + def value(self): + return '{self.name}: {self.val:{self.fmt}}'.format(self=self) + + +def log_batch(metric_list, + batch_id, + epoch_id, + total_epoch, + mode, + ips, + 
eta_sec: int = None): + batch_cost = str(metric_list['batch_time'].value) + ' sec,' + reader_cost = str(metric_list['reader_time'].value) + ' sec,' + + metric_values = [] + for m in metric_list: + if not (m == 'batch_time' or m == 'reader_time'): + metric_values.append(metric_list[m].value) + metric_str = ' '.join([str(v) for v in metric_values]) + epoch_str = "epoch:[{:>3d}/{:<3d}]".format(epoch_id, total_epoch) + step_str = "{:s} step:{:<4d}".format(mode, batch_id) + if eta_sec is not None: + eta_str = "eta: {:s}".format( + str(datetime.timedelta(seconds=int(eta_sec)))) + else: + eta_str = '' + logger.info("{:s} {:s} {:s} {:s} {:s} {} {:s}".format( + coloring(epoch_str, "HEADER") if batch_id == 0 else epoch_str, + coloring(step_str, "PURPLE"), coloring(metric_str, 'OKGREEN'), + coloring(batch_cost, "OKGREEN"), coloring(reader_cost, 'OKGREEN'), ips, + eta_str)) + + +def log_epoch(metric_list, epoch, mode, ips): + batch_cost = 'avg_' + str(metric_list['batch_time'].value) + ' sec,' + reader_cost = 'avg_' + str(metric_list['reader_time'].value) + ' sec,' + batch_sum = str(metric_list['batch_time'].total) + ' sec,' + + metric_values = [] + for m in metric_list: + if not (m == 'batch_time' or m == 'reader_time'): + metric_values.append(metric_list[m].mean) + metric_str = ' '.join([str(v) for v in metric_values]) + + end_epoch_str = "END epoch:{:<3d}".format(epoch) + + logger.info("{:s} {:s} {:s} {:s} {:s} {:s} {}".format( + coloring(end_epoch_str, "RED"), coloring(mode, "PURPLE"), + coloring(metric_str, "OKGREEN"), coloring(batch_cost, "OKGREEN"), + coloring(reader_cost, "OKGREEN"), coloring(batch_sum, "OKGREEN"), ips)) diff --git a/Bank_second_part/detect_process/paddlevideo/utils/registry.py b/Bank_second_part/detect_process/paddlevideo/utils/registry.py new file mode 100644 index 0000000..81b76bd --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/registry.py @@ -0,0 +1,96 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class Registry(object): + """ + The registry that provides name -> object mapping, to support third-party users' custom modules. + + To register an object: + + .. code-block:: python + + BACKBONES = Registry('backbone') + @BACKBONES.register() + class ResNet: + pass + Or: + .. code-block:: python + + BACKBONES = Registry('backbone') + class ResNet: + pass + BACKBONES.register(ResNet) + + Usage: To build a module. + + .. 
code-block:: python + backbone_name = "ResNet" + b = BACKBONES.get(backbone_name)() + + """ + def __init__(self, name): + """ + Args: + name (str): the name of this registry + """ + self._name = name + self._obj_map = {} + + def __contains__(self, key): + return self._obj_map.get(key) is not None + + def _do_register(self, name, obj): + assert ( + name not in self._obj_map + ), "An object named '{}' was already registered in '{}' registry!".format( + name, self._name) + self._obj_map[name] = obj + + def register(self, obj=None, name=None): + """ + Register the given object under the the name `obj.__name__`. + Can be used as either a decorator or not. See docstring of this class for usage. + """ + if obj is None: + # used as a decorator + def deco(func_or_class, name=name): + if name is None: + name = func_or_class.__name__ + self._do_register(name, func_or_class) + return func_or_class + + return deco + + # used as a function call + if name is None: + name = obj.__name__ + self._do_register(name, obj) + + def get(self, name): + """Get the registry record. + + Args: + name (str): The class name. + + Returns: + ret: The class. + """ + ret = self._obj_map.get(name) + if ret is None: + raise KeyError( + "No object named '{}' found in '{}' registry!".format( + name, self._name)) + + return ret diff --git a/Bank_second_part/detect_process/paddlevideo/utils/save_load.py b/Bank_second_part/detect_process/paddlevideo/utils/save_load.py new file mode 100644 index 0000000..10bb5f0 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/save_load.py @@ -0,0 +1,289 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import os.path as osp +import time + +import paddle +import paddle.nn.functional as F +from paddlevideo.utils import get_logger, main_only +from tqdm import tqdm +import numpy as np +from scipy import ndimage + + +def pretrain_swin_param_trans(model, state_dicts): + # delete classifier's params + if 'head.fc' + '.weight' in state_dicts: + del state_dicts['head.fc' + '.weight'] + if 'head.fc' + '.bias' in state_dicts: + del state_dicts['head.fc' + '.bias'] + + state_dicts = { + k.replace('backbone.', ''): v + for k, v in state_dicts.items() + } + + if len(state_dicts) == len(model.state_dict()): + print("Load 3D weights") + return state_dicts + + print("Load 2D weights") + relative_position_index_keys = [ + k for k in state_dicts.keys() if "relative_position_index" in k + ] + for k in relative_position_index_keys: + del state_dicts[k] + + # delete attn_mask since we always re-init it + attn_mask_keys = [k for k in state_dicts.keys() if "attn_mask" in k] + for k in attn_mask_keys: + del state_dicts[k] + + state_dicts['patch_embed.proj.weight'] = state_dicts[ + 'patch_embed.proj.weight'].unsqueeze(2).tile( + [1, 1, model.patch_size[0], 1, 1]) / model.patch_size[0] + + # bicubic interpolate relative_position_bias_table if not match + relative_position_bias_table_keys = [ + k for k in state_dicts.keys() if "relative_position_bias_table" in k + ] + total_len = len(relative_position_bias_table_keys) + with tqdm(total=total_len, + position=1, + bar_format='{desc}', + desc="Loading weights") as desc: + for key in tqdm(relative_position_bias_table_keys, + total=total_len, + position=0): + relative_position_bias_table_pretrained = state_dicts[key] + relative_position_bias_table_current = model.state_dict()[key] + L1, nH1 = relative_position_bias_table_pretrained.shape + L2, nH2 = relative_position_bias_table_current.shape + L2 = (2 * model.window_size[1] - 1) * (2 * model.window_size[2] - 1) + wd = model.window_size[0] + if nH1 != nH2: + desc.set_description(f"Error in loading {key}, skip") + else: + if L1 != L2: + S1 = int(L1**0.5) + relative_position_bias_table_pretrained_resized = paddle.nn.functional.interpolate( + relative_position_bias_table_pretrained.transpose( + [1, 0]).reshape([1, nH1, S1, S1]), + size=(2 * model.window_size[1] - 1, + 2 * model.window_size[2] - 1), + mode='bicubic') + relative_position_bias_table_pretrained = relative_position_bias_table_pretrained_resized.reshape( + [nH2, L2]).transpose([1, 0]) + desc.set_description(f"Loading {key}") + state_dicts[key] = relative_position_bias_table_pretrained.tile( + [2 * wd - 1, 1]) + time.sleep(0.01) + ret_str = "loading {:<20d} weights completed.".format( + len(model.state_dict())) + desc.set_description(ret_str) + return state_dicts + + +def pretrain_vit_param_trans(model, state_dicts, num_patches, num_seg, + attention_type): + """ + Convert ViT's pre-trained model parameters to a parameter dictionary that matches the existing model + """ + if 'head' + '.weight' in state_dicts: + del state_dicts['head' + '.weight'] + if 'head' + '.bias' in state_dicts: + del state_dicts['head' + '.bias'] + + total_len = len(model.state_dict()) + if num_patches + 1 != state_dicts['pos_embed'].shape[1]: # when + pos_embed = state_dicts['pos_embed'] + cls_pos_embed = paddle.to_tensor( + pos_embed[0, 0, :]).unsqueeze(0).unsqueeze(1) + other_pos_embed = paddle.to_tensor(pos_embed[0, 1:, :]) + gs_new = int(np.sqrt(num_patches)) + gs_old = int(np.sqrt(other_pos_embed.shape[0])) + zoom = (gs_new / gs_old, gs_new / gs_old, 1) + other_pos_embed = 
paddle.reshape(other_pos_embed, [gs_old, gs_old, -1]) + other_pos_embed = ndimage.zoom(other_pos_embed, zoom, order=1) + other_pos_embed = paddle.to_tensor(other_pos_embed) + new_pos_embed = paddle.reshape(other_pos_embed, [1, num_patches, -1]) + new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed), axis=1) + state_dicts['pos_embed'] = new_pos_embed + time.sleep(0.01) + + if 'time_embed' in state_dicts and num_seg != state_dicts[ + 'time_embed'].shape[1]: + time_embed = state_dicts['time_embed'].transpose((0, 2, 1)).unsqueeze(0) + new_time_embed = F.interpolate(time_embed, + size=(time_embed.shape[-2], num_seg), + mode='nearest') + state_dicts['time_embed'] = new_time_embed.squeeze(0).transpose( + (0, 2, 1)) + time.sleep(0.01) + with tqdm(total=total_len, + position=1, + bar_format='{desc}', + desc="Loading weights") as desc: + if attention_type == 'divided_space_time': + new_state_dicts = state_dicts.copy() + for key in tqdm(state_dicts): + if 'blocks' in key and 'attn' in key: + desc.set_description("Loading %s" % key) + new_key = key.replace('attn', 'temporal_attn') + if not new_key in state_dicts: + new_state_dicts[new_key] = state_dicts[key] + else: + new_state_dicts[new_key] = state_dicts[new_key] + if 'blocks' in key and 'norm1' in key: + desc.set_description("Loading %s" % key) + new_key = key.replace('norm1', 'temporal_norm1') + if not new_key in state_dicts: + new_state_dicts[new_key] = state_dicts[key] + else: + new_state_dicts[new_key] = state_dicts[new_key] + time.sleep(0.01) + elif attention_type == 'space_only': # tokenshift raw vit + new_state_dicts = state_dicts.copy() + + ret_str = "loading {:<20d} weights completed.".format( + len(model.state_dict())) + desc.set_description(ret_str) + return new_state_dicts + + +def pretrain_resnet18_param_trans(model, loaded_dict): + encoder_dict = model.encoder.state_dict() + pose_encoder_dict = model.pose_encoder.state_dict() + + names = ['encoder.', 'encoder_day.', 'encoder_night.'] + for name in names: + total_len = len(loaded_dict.items()) + with tqdm(total=total_len, + position=1, + bar_format='{desc}', + desc="Loading weights") as desc: + for key, value in tqdm(loaded_dict.items(), + total=total_len, + position=0): + key = str(name + key) + if key in encoder_dict: + encoder_dict[key] = value + desc.set_description('Loading %s' % key) + time.sleep(0.01) + + num_input_images = 2 + loaded_dict['conv1.weight'] = paddle.concat( + [loaded_dict['conv1.weight']] * num_input_images, 1) / num_input_images + total_len = len(loaded_dict.items()) + with tqdm(total=total_len, + position=1, + bar_format='{desc}', + desc="Loading weights") as desc: + for name, value in tqdm(loaded_dict.items(), + total=total_len, + position=0): + name = str('encoder.' + name) + if name in pose_encoder_dict: + pose_encoder_dict[name] = value + desc.set_description('Loading %s' % key) + time.sleep(0.01) + ret_str = "loading {:<20d} weights completed.".format( + len(model.state_dict())) + desc.set_description(ret_str) + return encoder_dict, pose_encoder_dict + + +#XXX(shipping): maybe need load N times because of different cards have different params. +@main_only +def load_ckpt(model, weight_path, **kargs): + """ + 1. Load pre-trained model parameters + 2. Extract and convert from the pre-trained model to the parameters + required by the existing model + 3. 
Load the converted parameters of the existing model + """ + #model.set_state_dict(state_dict) + + if not osp.isfile(weight_path): + raise IOError(f'{weight_path} is not a checkpoint file') + #state_dicts = load(weight_path) + + logger = get_logger("paddlevideo") + state_dicts = paddle.load(weight_path) + if 'ResnetEncoder' in str(model): + encoder_dict, pose_encoder_dict = pretrain_resnet18_param_trans( + model, state_dicts) + model.encoder.load_dict(encoder_dict) + model.pose_encoder.load_dict(pose_encoder_dict) + tmp = model.state_dict() + elif "VisionTransformer" in str(model): # For TimeSformer case + tmp = pretrain_vit_param_trans(model, state_dicts, kargs['num_patches'], + kargs['num_seg'], + kargs['attention_type']) + elif 'SwinTransformer3D' in str(model): + tmp = pretrain_swin_param_trans(model, state_dicts) + else: + tmp = {} + total_len = len(model.state_dict()) + with tqdm(total=total_len, + position=1, + bar_format='{desc}', + desc="Loading weights") as desc: + for item in tqdm(model.state_dict(), total=total_len, position=0): + name = item + desc.set_description('Loading %s' % name) + if name not in state_dicts: # Convert from non-parallel model + if str('backbone.' + name) in state_dicts: + tmp[name] = state_dicts['backbone.' + name] + else: # Convert from parallel model + tmp[name] = state_dicts[name] + time.sleep(0.01) + ret_str = "loading {:<20d} weights completed.".format( + len(model.state_dict())) + desc.set_description(ret_str) + model.set_state_dict(tmp) + + +def mkdir(dir): + if not os.path.exists(dir): + # avoid error when train with multiple gpus + try: + os.makedirs(dir) + except: + pass + + +def _extract_student_weights(all_params, student_prefix="Student."): + s_params = { + key[len(student_prefix):]: all_params[key] + for key in all_params if student_prefix in key + } + return s_params + + +@main_only +def save(obj, path, save_student_model=False): + if save_student_model: + s_params = _extract_student_weights(obj) + student_path = path.replace(".pdparams", "_student.pdparams") + if len(s_params) > 0: + paddle.save(s_params, student_path) + paddle.save(obj, path) + + +def load(file_name): + if not osp.isfile(file_name): + raise IOError(f'{file_name} not exist') + return paddle.load(file_name) diff --git a/Bank_second_part/detect_process/paddlevideo/version.py b/Bank_second_part/detect_process/paddlevideo/version.py new file mode 100644 index 0000000..b5b7f48 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/version.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +__all__ = ["paddlevideo_version"] +paddlevideo_version = "0.0.1" diff --git a/Bank_second_part/detect_process/personDet.py b/Bank_second_part/detect_process/personDet.py new file mode 100644 index 0000000..0cef736 --- /dev/null +++ b/Bank_second_part/detect_process/personDet.py @@ -0,0 +1,42 @@ + +def analysis_yolov8(frame, model_coco,confidence_set): + + # 第一步:用COCO数据集推理 + results_coco = model_coco(frame) + + re_list = [] + + if results_coco: + + for r in results_coco: + + boxes = r.boxes + + idx = 0 + + for box in boxes: + + idx += 1 + b = box.xyxy[0] # get box coordinates in (top, left, bottom, right) format + c = box.cls + + # 保存标签和坐标值作为返回结果 + blist = b.tolist() + labels_name = model_coco.names[int(c)] + + confidence = float(box.conf) + + confidence = round(confidence, 2) + + # 过滤置信度以下目标 + if confidence < confidence_set: + + continue + + if labels_name == 'person': + # 一个结果字典 + re_dict = {labels_name:blist} + + re_list.append(re_dict) + + return re_list diff --git a/Bank_second_part/detect_process/tools.py b/Bank_second_part/detect_process/tools.py deleted file mode 100644 index 8d9af60..0000000 --- a/Bank_second_part/detect_process/tools.py +++ /dev/null @@ -1,212 +0,0 @@ -import cv2 -import os - -class Process_tools(): - - # 图像文件夹 - def get_video_list(path): - video_ext = [".mp4", ".avi",".MP4"] - video_names = [] - for maindir, subdir, file_name_list in os.walk(path): - for filename in file_name_list: - apath = os.path.join(maindir, filename) - ext = os.path.splitext(apath)[1] - if ext in video_ext: - video_names.append(apath) - return video_names - - - # 截取裁剪需要的视频帧 - def save_seg_video(video_name,frameToStart,frametoStop,videoWriter,bbox): - - cap = cv2.VideoCapture(video_name) - count = 0 - while True: - - success, frame = cap.read() - - if success: - - count += 1 - if count <= frametoStop and count > frameToStart: # 选取起始帧 - print('correct= ', count) - - #裁剪视频画面 - frame_target = frame[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])] # (split_height, split_width) - - videoWriter.write(frame_target) - - if not success or count >= frametoStop: - break - - print('end') - - - # 获得字典中所有values值(这个值是列表) - def get_dict_values(lst): - """ - 获取列表中所有字典的 values 值(如果值是列表) - - 参数: - lst: 包含字典的列表 - - 返回值: - values: 包含所有字典的 values 值的列表(如果值是列表) - """ - return [value for dictionary in lst for value in dictionary.values() if isinstance(value, list)] - - - - # 解析检测后的结果,为检测后的结果排序 - def analysis_sort_list(result_dict): - - # print('result_dict:',result_dict) - - # 获得检测列表 - re_list = result_dict['start_bbox'] - # print('re_list:',re_list) - - # 获得列表中所有字典的values值 - re_bbox_list = Process_tools.get_dict_values(re_list) - - # 为检测出来的标注框排序 - sorted_lst = sorted(re_bbox_list, key=lambda x: x[0]) - - return sorted_lst - - - #对比重叠率高的两个部分,并结合标注框,保存最大的标注框 - def contrast_bbox(e_bbox,r_bbox): - - e_bbox_min = e_bbox[:2] - r_bbox_min = r_bbox[:2] - - bbox_min = [min(x, y) for x, y in zip(e_bbox_min, r_bbox_min)] - - e_bbox_max = e_bbox[-2:] - r_bbox_max = r_bbox[-2:] - - bbox_max = [max(x, y) for x, y in zip(e_bbox_max, r_bbox_max)] - - bbox = bbox_min + bbox_max - - return bbox - - - - # 解析result_list列表 - def analysis_re01_list(example_dict,result_dict): - - # 第一次检测到目标的帧率和信息 - example_dict_fps = list(example_dict.keys())[0] - example_sorted_lst = Process_tools.analysis_sort_list(example_dict) - - # 当前帧检测结果中所有的检测结果数值 - re_dict_fps = list(result_dict.keys())[0] - re_dict_sorted_lst = Process_tools.analysis_sort_list(result_dict) - - # 保存前后帧率连续的范围、筛选出相同的部分 - cut_list = [] - example_temp = [] - re_temp = [] - - 
for i,ex_bbox in enumerate(example_sorted_lst): - - for j,re_bbox in enumerate(re_dict_sorted_lst): - - iou = Process_tools.calculate_iou(box1=ex_bbox, box2=re_bbox) - - # print(iou) - - if iou > 0: - - bbox = Process_tools.contrast_bbox(e_bbox=ex_bbox,r_bbox=re_bbox) - - cut_list.append({i:bbox}) - example_temp.append(ex_bbox) - re_temp.append(re_bbox) - - break - - else: - continue - - example_sorted_lst = [item for item in example_sorted_lst if item not in example_temp] - re_dict_sorted_lst = [item for item in re_dict_sorted_lst if item not in re_temp] - - return cut_list,example_sorted_lst,re_dict_sorted_lst - - - # 计算前后帧率重叠范围 - def calculate_iou(box1, box2): - """ - 计算两个边界框之间的IoU值 - - 参数: - box1: 边界框1的坐标(x1, y1, x2, y2) - box2: 边界框2的坐标(x1, y1, x2, y2) - - 返回值: - iou: 两个边界框之间的IoU值 - """ - x1 = max(box1[0], box2[0]) - y1 = max(box1[1], box2[1]) - x2 = min(box1[2], box2[2]) - y2 = min(box1[3], box2[3]) - - # 计算交集区域面积 - intersection_area = max(0, x2 - x1 + 1) * max(0, y2 - y1 + 1) - - # 计算边界框1和边界框2的面积 - box1_area = (box1[2] - box1[0] + 1) * (box1[3] - box1[1] + 1) - box2_area = (box2[2] - box2[0] + 1) * (box2[3] - box2[1] + 1) - - # 计算并集区域面积 - union_area = box1_area + box2_area - intersection_area - - # 计算IoU值 - iou = intersection_area / union_area - - return iou - - def para_correction(images_size,bbox,dertpara): - - ''' - 修正检测后标注框过小的情况,如果有修正参数则使用修正参数,如果没有就按照坐标值扩大两倍 - - ''' - - if dertpara: - pass - else: - w = (bbox[2] - bbox[0]) /2 - h = (bbox[3] - bbox[1]) /2 - - bbox_extand_list_x = [bbox[0] - w,bbox[2] + w] - bbox_extand_list_y = [bbox[1] - h,bbox[3] + h] - - bbox_list_x = Process_tools.contrast(size=images_size[0],bbox_extand_list=bbox_extand_list_x) - bbox_list_y = Process_tools.contrast(size=images_size[1],bbox_extand_list=bbox_extand_list_y) - - bbox_list = bbox_list_x + bbox_list_y - - return bbox_list - - - def contrast(size,bbox_extand_list): - - ''' - 对比数值是否在这个范围内 - ''' - - bbox_list = [] - - for x in bbox_extand_list: - - if x in range(size): - bbox_list.append(x) - if x > size: - bbox_list.append(size) - if x < 0: - bbox_list.append(0) - return bbox_list \ No newline at end of file diff --git a/Bank_second_part/detect_process/tools/__init__.py b/Bank_second_part/detect_process/tools/__init__.py new file mode 100644 index 0000000..e8d173d --- /dev/null +++ b/Bank_second_part/detect_process/tools/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['utils', 'PaddleVideo', 'ava_predict'] + +from . import utils +from .wheel import PaddleVideo +from . 
import ava_predict
diff --git a/Bank_second_part/detect_process/tools/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/tools/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000..3988193
Binary files /dev/null and b/Bank_second_part/detect_process/tools/__pycache__/__init__.cpython-310.pyc differ
diff --git a/Bank_second_part/detect_process/tools/__pycache__/ava_predict.cpython-310.pyc b/Bank_second_part/detect_process/tools/__pycache__/ava_predict.cpython-310.pyc
new file mode 100644
index 0000000..c133bea
Binary files /dev/null and b/Bank_second_part/detect_process/tools/__pycache__/ava_predict.cpython-310.pyc differ
diff --git a/Bank_second_part/detect_process/tools/__pycache__/utils.cpython-310.pyc b/Bank_second_part/detect_process/tools/__pycache__/utils.cpython-310.pyc
new file mode 100644
index 0000000..b17058f
Binary files /dev/null and b/Bank_second_part/detect_process/tools/__pycache__/utils.cpython-310.pyc differ
diff --git a/Bank_second_part/detect_process/tools/__pycache__/wheel.cpython-310.pyc b/Bank_second_part/detect_process/tools/__pycache__/wheel.cpython-310.pyc
new file mode 100644
index 0000000..55c3bf9
Binary files /dev/null and b/Bank_second_part/detect_process/tools/__pycache__/wheel.cpython-310.pyc differ
diff --git a/Bank_second_part/detect_process/tools/ava_predict.py b/Bank_second_part/detect_process/tools/ava_predict.py
new file mode 100644
index 0000000..5d333a2
--- /dev/null
+++ b/Bank_second_part/detect_process/tools/ava_predict.py
@@ -0,0 +1,509 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import paddle
+import os, sys
+import copy as cp
+import cv2
+import math
+try:
+    import ppdet
+except ImportError as e:
+    print(
+        f"Warning! {e}, [paddledet] package and its dependencies are required for AVA."
+ ) + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) + +from paddlevideo.modeling.builder import build_model +from paddlevideo.utils import get_config +from paddlevideo.loader.builder import build_dataloader, build_dataset, build_pipeline +from paddlevideo.metrics.ava_utils import read_labelmap + +import time +from os import path as osp +import numpy as np +from paddlevideo.utils import get_config +import pickle + +from paddlevideo.utils import (get_logger, load, mkdir, save) +import shutil + +FONTFACE = cv2.FONT_HERSHEY_DUPLEX +FONTSCALE = 0.5 +FONTCOLOR = (255, 255, 255) # BGR, white +MSGCOLOR = (128, 128, 128) # BGR, gray +THICKNESS = 1 +LINETYPE = 1 + + +def hex2color(h): + """Convert the 6-digit hex string to tuple of 3 int value (RGB)""" + return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16)) + + +plate_blue = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4' +plate_blue = plate_blue.split('-') +plate_blue = [hex2color(h) for h in plate_blue] +plate_green = '004b23-006400-007200-008000-38b000-70e000' +plate_green = plate_green.split('-') +plate_green = [hex2color(h) for h in plate_green] + + +def abbrev(name): + """Get the abbreviation of label name: + 'take (an object) from (a person)' -> 'take ... from ...' + """ + while name.find('(') != -1: + st, ed = name.find('('), name.find(')') + name = name[:st] + '...' + name[ed + 1:] + return name + + +# annotations is pred results +def visualize(frames, annotations, plate=plate_blue, max_num=5): + """Visualize frames with predicted annotations. + Args: + frames (list[np.ndarray]): Frames for visualization, note that + len(frames) % len(annotations) should be 0. + annotations (list[list[tuple]]): The predicted results. + plate (str): The plate used for visualization. Default: plate_blue. + max_num (int): Max number of labels to visualize for a person box. + Default: 5,目前不能大于5. + Returns: + list[np.ndarray]: Visualized frames. + """ + + assert max_num + 1 <= len(plate) + plate = [x[::-1] for x in plate] + frames_ = cp.deepcopy(frames) + nf, na = len(frames), len(annotations) + assert nf % na == 0 + nfpa = len(frames) // len(annotations) + anno = None + h, w, _ = frames[0].shape + # proposals被归一化需要还原真实坐标值 + scale_ratio = np.array([w, h, w, h]) + + for i in range(na): + anno = annotations[i] + if anno is None: + continue + for j in range(nfpa): + ind = i * nfpa + j + frame = frames_[ind] + for ann in anno: + box = ann[0] + label = ann[1] + if not len(label): + continue + score = ann[2] + box = (box * scale_ratio).astype(np.int64) + st, ed = tuple(box[:2]), tuple(box[2:]) + cv2.rectangle(frame, st, ed, plate[0], 2) + for k, lb in enumerate(label): + if k >= max_num: + break + text = abbrev(lb) + text = ': '.join([text, str(score[k])]) + location = (0 + st[0], 18 + k * 18 + st[1]) + textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE, + THICKNESS)[0] + textwidth = textsize[0] + diag0 = (location[0] + textwidth, location[1] - 14) + diag1 = (location[0], location[1] + 2) + cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1) + cv2.putText(frame, text, location, FONTFACE, FONTSCALE, + FONTCOLOR, THICKNESS, LINETYPE) + + return frames_ + + +def frame_extraction(video_path, target_dir): + """Extract frames given video_path. + Args: + video_path (str): The video_path. 
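+        target_dir (str): Directory where the extracted frames are written.
+    Returns:
+        tuple: (frame_paths, frames, FPS) of the extracted video frames.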
+ """ + + if not os.path.exists(target_dir): + os.makedirs(target_dir, exist_ok=True) + + # Should be able to handle videos up to several hours + frame_tmpl = osp.join(target_dir, '{:05d}.jpg') + vid = cv2.VideoCapture(video_path) + + FPS = int(vid.get(5)) + + frames = [] + frame_paths = [] + + flag, frame = vid.read() + index = 1 + while flag: + frames.append(frame) + frame_path = frame_tmpl.format(index) + frame_paths.append(frame_path) + cv2.imwrite(frame_path, frame) + index += 1 + flag, frame = vid.read() + return frame_paths, frames, FPS + + +def parse_args(): + def str2bool(v): + return v.lower() in ("true", "t", "1") + + # general params + parser = argparse.ArgumentParser("PaddleVideo Inference model script") + parser.add_argument('-c', + '--config', + type=str, + default='configs/example.yaml', + help='config file path') + + parser.add_argument('--video_path', help='video file/url') + + parser.add_argument('-o', + '--override', + action='append', + default=[], + help='config options to be overridden') + parser.add_argument('-w', + '--weights', + type=str, + help='weights for finetuning or testing') + + #detection_model_name + parser.add_argument('--detection_model_name', + help='the name of detection model ') + # detection_model_weights + parser.add_argument('--detection_model_weights', + help='the weights path of detection model ') + + # params for predict + parser.add_argument('--out-filename', + default='ava_det_demo.mp4', + help='output filename') + parser.add_argument('--predict-stepsize', + default=8, + type=int, + help='give out a prediction per n frames') + parser.add_argument( + '--output-stepsize', + default=4, + type=int, + help=('show one frame per n frames in the demo, we should have: ' + 'predict_stepsize % output_stepsize == 0')) + parser.add_argument('--output-fps', + default=6, + type=int, + help='the fps of demo video output') + + return parser.parse_args() + + +# 一帧的结果。根据概率大小进行排序 +def pack_result(human_detection, result): + """Short summary. + Args: + human_detection (np.ndarray): Human detection result. + result (type): The predicted label of each human proposal. + Returns: + tuple: Tuple of human proposal, label name and label score. + """ + results = [] + if result is None: + return None + + for prop, res in zip(human_detection, result): + res.sort(key=lambda x: -x[1]) + + results.append((prop, [x[0] for x in res], [x[1] for x in res])) + + return results + + +# 构造数据处理需要的results +def get_timestep_result(frame_dir, timestamp, clip_len, frame_interval, FPS): + result = {} + + result["frame_dir"] = frame_dir + + frame_num = len(os.listdir(frame_dir)) + + dir_name = frame_dir.split("/")[-1] + result["video_id"] = dir_name + + result['timestamp'] = timestamp + + timestamp_str = '{:04d}'.format(timestamp) + img_key = dir_name + "," + timestamp_str + result['img_key'] = img_key + + result['shot_info'] = (1, frame_num) + result['fps'] = FPS + + result['suffix'] = '{:05}.jpg' + + result['timestamp_start'] = 1 + result['timestamp_end'] = int(frame_num / result['fps']) + + return result + + +def detection_inference(frame_paths, output_dir, model_name, weights_path): + """Detect human boxes given frame paths. + Args: + frame_paths (list[str]): The paths of frames to do detection inference. + Returns: + list[np.ndarray]: The human detection results. 
+ """ + + detection_cfg = ppdet.model_zoo.get_config_file(model_name) + detection_cfg = ppdet.core.workspace.load_config(detection_cfg) + detection_trainer = ppdet.engine.Trainer(detection_cfg, mode='test') + detection_trainer.load_weights(weights_path) + + print('Performing Human Detection for each frame') + + detection_trainer.predict(frame_paths, output_dir=output_dir, save_txt=True) + + print("finish object detection") + + results = [] + + for frame_path in frame_paths: + (file_dir, file_name) = os.path.split(frame_path) + (file_path, ext) = os.path.splitext(frame_path) + + txt_file_name = file_name.replace(ext, ".txt") + txt_path = os.path.join(output_dir, txt_file_name) + results.append(txt_path) + + return results + + +def get_detection_result(txt_file_path, img_h, img_w, person_det_score_thr): + """ + 根据检测结果文件得到图像中人的检测框(proposals)和置信度(scores) + txt_file_path:检测结果存放路径 + img_h:图像高度 + img_w:图像宽度 + """ + + proposals = [] + scores = [] + + with open(txt_file_path, 'r') as detection_file: + lines = detection_file.readlines() + for line in lines: # person 0.9842637181282043 0.0 469.1407470703125 944.7770385742188 831.806396484375 + items = line.split(" ") + if items[0] != 'person': #只要人 + continue + + score = items[1] + + if (float)(score) < person_det_score_thr: + continue + + x1 = (float(items[2])) / img_w + y1 = ((float)(items[3])) / img_h + box_w = ((float)(items[4])) + box_h = ((float)(items[5])) + + x2 = (float(items[2]) + box_w) / img_w + y2 = (float(items[3]) + box_h) / img_h + + scores.append(score) + + proposals.append([x1, y1, x2, y2]) + + return np.array(proposals), np.array(scores) + + +@paddle.no_grad() +def main(args): + config = get_config(args.config, show=False) #parse config file + + # extract frames from video + video_path = args.video_path + frame_dir = 'tmp_frames' + frame_paths, frames, FPS = frame_extraction(video_path, frame_dir) + + num_frame = len(frame_paths) #视频秒数*FPS + assert num_frame != 0 + print("Frame Number:", num_frame) + + # 帧图像高度和宽度 + h, w, _ = frames[0].shape + + # Get clip_len, frame_interval and calculate center index of each clip + data_process_pipeline = build_pipeline(config.PIPELINE.test) #测试时输出处理流水配置 + + clip_len = config.PIPELINE.test.sample['clip_len'] + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + frame_interval = config.PIPELINE.test.sample['frame_interval'] + + # 此处关键帧每秒取一个 + clip_len = config.PIPELINE.test.sample['clip_len'] + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + frame_interval = config.PIPELINE.test.sample['frame_interval'] + window_size = clip_len * frame_interval + timestamps = np.arange(window_size // 2, (num_frame + 1 - window_size // 2), + args.predict_stepsize) + print("timetamps number:", len(timestamps)) + + # get selected frame list according to timestamps + selected_frame_list = [] + for timestamp in timestamps: + selected_frame_list.append(frame_paths[timestamp - 1]) + + # Load label_map + label_map_path = config.DATASET.test['label_file'] + categories, class_whitelist = read_labelmap(open(label_map_path)) + label_map = {} + for item in categories: + id = item['id'] + name = item['name'] + label_map[id] = name + + # Construct model. 
+ if config.MODEL.backbone.get('pretrained'): + config.MODEL.backbone.pretrained = '' # disable pretrain model init + model = build_model(config.MODEL) + + model.eval() + state_dicts = load(args.weights) + model.set_state_dict(state_dicts) + + detection_result_dir = 'tmp_detection' + detection_model_name = args.detection_model_name + detection_model_weights = args.detection_model_weights + detection_txt_list = detection_inference(selected_frame_list, + detection_result_dir, + detection_model_name, + detection_model_weights) + assert len(detection_txt_list) == len(timestamps) + + print('Performing SpatioTemporal Action Detection for each clip') + human_detections = [] + predictions = [] + + index = 0 + for timestamp, detection_txt_path in zip(timestamps, detection_txt_list): + proposals, scores = get_detection_result( + detection_txt_path, h, w, + (float)(config.DATASET.test['person_det_score_thr'])) + if proposals.shape[0] == 0: + predictions.append(None) + human_detections.append(None) + continue + + human_detections.append(proposals) + + result = get_timestep_result(frame_dir, + timestamp, + clip_len, + frame_interval, + FPS=FPS) + result["proposals"] = proposals + result["scores"] = scores + + new_result = data_process_pipeline(result) + proposals = new_result['proposals'] + + img_slow = new_result['imgs'][0] + img_slow = img_slow[np.newaxis, :] + img_fast = new_result['imgs'][1] + img_fast = img_fast[np.newaxis, :] + + proposals = proposals[np.newaxis, :] + + scores = scores[np.newaxis, :] + + img_shape = np.asarray(new_result['img_shape']) + img_shape = img_shape[np.newaxis, :] + + data = [ + paddle.to_tensor(img_slow, dtype='float32'), + paddle.to_tensor(img_fast, dtype='float32'), + paddle.to_tensor(proposals, dtype='float32'), scores, + paddle.to_tensor(img_shape, dtype='int32') + ] + + with paddle.no_grad(): + result = model(data, mode='infer') + + result = result[0] + prediction = [] + + person_num = proposals.shape[1] + # N proposals + for i in range(person_num): + prediction.append([]) + + # Perform action score thr + for i in range(len(result)): + if i + 1 not in class_whitelist: + continue + for j in range(person_num): + if result[i][j, 4] > config.MODEL.head['action_thr']: + prediction[j].append((label_map[i + 1], result[i][j, + 4])) + predictions.append(prediction) + + index = index + 1 + if index % 10 == 0: + print(index, "/", len(timestamps)) + + results = [] + for human_detection, prediction in zip(human_detections, predictions): + results.append(pack_result(human_detection, prediction)) + + def dense_timestamps(timestamps, n): + """Make it nx frames.""" + old_frame_interval = (timestamps[1] - timestamps[0]) + start = timestamps[0] - old_frame_interval / n * (n - 1) / 2 + new_frame_inds = np.arange( + len(timestamps) * n) * old_frame_interval / n + start + return new_frame_inds.astype(np.int) + + dense_n = int(args.predict_stepsize / args.output_stepsize) #30 + frames = [ + cv2.imread(frame_paths[i - 1]) + for i in dense_timestamps(timestamps, dense_n) + ] + + vis_frames = visualize(frames, results) + + try: + import moviepy.editor as mpy + except ImportError: + raise ImportError('Please install moviepy to enable output file') + + vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], + fps=args.output_fps) + vid.write_videofile(args.out_filename) + print("finish write !") + + # delete tmp files and dirs + shutil.rmtree(frame_dir) + shutil.rmtree(detection_result_dir) + + +if __name__ == '__main__': + args = parse_args() #解析参数 + main(args) diff --git 
a/Bank_second_part/detect_process/tools/export_model.py b/Bank_second_part/detect_process/tools/export_model.py new file mode 100644 index 0000000..401091a --- /dev/null +++ b/Bank_second_part/detect_process/tools/export_model.py @@ -0,0 +1,267 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import os.path as osp +import sys + +import paddle +from paddle.jit import to_static +from paddle.static import InputSpec + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) + +from paddlevideo.modeling.builder import build_model +from paddlevideo.utils import get_config + + +def parse_args(): + parser = argparse.ArgumentParser("PaddleVideo export model script") + parser.add_argument('-c', + '--config', + type=str, + default='configs/example.yaml', + help='config file path') + parser.add_argument('--override', + action='append', + default=[], + help='config options to be overridden') + parser.add_argument("-p", + "--pretrained_params", + default='./best.pdparams', + type=str, + help='params path') + parser.add_argument("-o", + "--output_path", + type=str, + default="./inference", + help='output path') + + parser.add_argument('--save_name', + type=str, + default=None, + help='specify the exported inference \ + files(pdiparams and pdmodel) name,\ + only used in TIPC') + + return parser.parse_args() + + +def trim_config(cfg): + """ + Reuse the trainging config will bring useless attributes, such as: backbone.pretrained model. + and some build phase attributes should be overrided, such as: backbone.num_seg. + Trim it here. 
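+    Returns the trimmed cfg together with the model_name read from it.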
+ """ + model_name = cfg.model_name + if cfg.MODEL.get('backbone') and cfg.MODEL.backbone.get('pretrained'): + cfg.MODEL.backbone.pretrained = "" # not ued when inference + + # for distillation + if cfg.MODEL.get('models'): + if cfg.MODEL.models[0]['Teacher']['backbone'].get('pretrained'): + cfg.MODEL.models[0]['Teacher']['backbone']['pretrained'] = "" + if cfg.MODEL.models[1]['Student']['backbone'].get('pretrained'): + cfg.MODEL.models[1]['Student']['backbone']['pretrained'] = "" + + return cfg, model_name + + +def get_input_spec(cfg, model_name): + if model_name in ['ppTSM', 'TSM', 'MoViNet', 'ppTSMv2']: + input_spec = [[ + InputSpec( + shape=[None, cfg.num_seg, 3, cfg.target_size, cfg.target_size], + dtype='float32'), + ]] + elif model_name in ['TokenShiftVisionTransformer']: + input_spec = [[ + InputSpec(shape=[ + None, 3, cfg.num_seg * 3, cfg.target_size, cfg.target_size + ], + dtype='float32'), + ]] + elif model_name in ['TSN', 'ppTSN']: + input_spec = [[ + InputSpec(shape=[ + None, cfg.num_seg * 10, 3, cfg.target_size, cfg.target_size + ], + dtype='float32'), + ]] + elif model_name in ['BMN']: + input_spec = [[ + InputSpec(shape=[None, cfg.feat_dim, cfg.tscale], + dtype='float32', + name='feat_input'), + ]] + elif model_name in ['TimeSformer', 'ppTimeSformer']: + input_spec = [[ + InputSpec(shape=[ + None, 3, cfg.num_seg * 3, cfg.target_size, cfg.target_size + ], + dtype='float32'), + ]] + elif model_name in ['VideoSwin']: + input_spec = [[ + InputSpec(shape=[ + None, 3, cfg.num_seg * cfg.seg_len * 1, cfg.target_size, + cfg.target_size + ], + dtype='float32'), + ]] + elif model_name in ['VideoSwin_TableTennis']: + input_spec = [[ + InputSpec(shape=[ + None, 3, cfg.num_seg * cfg.seg_len * 3, cfg.target_size, + cfg.target_size + ], + dtype='float32'), + ]] + elif model_name in ['AttentionLSTM']: + input_spec = [[ + InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[0]], + dtype='float32'), # for rgb_data + InputSpec(shape=[ + None, + ], dtype='int64'), # for rgb_len + InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[0]], + dtype='float32'), # for rgb_mask + InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[1]], + dtype='float32'), # for audio_data + InputSpec(shape=[ + None, + ], dtype='int64'), # for audio_len + InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[1]], + dtype='float32'), # for audio_mask + ]] + elif model_name in ['SlowFast']: + input_spec = [[ + InputSpec(shape=[ + None, 3, cfg.num_frames // cfg.alpha, cfg.target_size, + cfg.target_size + ], + dtype='float32', + name='slow_input'), + InputSpec(shape=[ + None, 3, cfg.num_frames, cfg.target_size, cfg.target_size + ], + dtype='float32', + name='fast_input'), + ]] + elif model_name in ['STGCN', 'AGCN', 'CTRGCN']: + input_spec = [[ + InputSpec(shape=[ + None, cfg.num_channels, cfg.window_size, cfg.vertex_nums, + cfg.person_nums + ], + dtype='float32'), + ]] + # 由于在模型运行过程中涉及到第一维乘human个数(N*M), 所以这里用1作为shape + elif model_name in ['AGCN2s']: + input_spec = [[ + InputSpec(shape=[ + 1, cfg.num_channels, cfg.window_size, cfg.vertex_nums, + cfg.person_nums + ], + dtype='float32'), + ]] + elif model_name in ['TransNetV2']: + input_spec = [[ + InputSpec(shape=[ + None, + cfg.num_frames, + cfg.height, + cfg.width, + cfg.num_channels, + ], + dtype='float32'), + ]] + elif model_name in ['MSTCN', 'ASRF']: + input_spec = [[ + InputSpec(shape=[None, cfg.num_channels, None], dtype='float32'), + ]] + elif model_name in ['ADDS']: + input_spec = [[ + InputSpec(shape=[None, cfg.num_channels, cfg.height, 
cfg.width], + dtype='float32'), + ]] + elif model_name in ['AVA_SlowFast_FastRcnn']: + input_spec = [[ + InputSpec(shape=[ + None, 3, cfg.num_frames // cfg.alpha, cfg.target_size, + cfg.target_size + ], + dtype='float32', + name='slow_input'), + InputSpec(shape=[ + None, 3, cfg.num_frames, cfg.target_size, cfg.target_size + ], + dtype='float32', + name='fast_input'), + InputSpec(shape=[None, None, 4], dtype='float32', name='proposals'), + InputSpec(shape=[None, 2], dtype='float32', name='img_shape') + ]] + elif model_name in ['PoseC3D']: + input_spec = [[ + InputSpec(shape=[None, 1, 17, 48, 56, 56], dtype='float32'), + ]] + elif model_name in ['YOWO']: + input_spec = [[ + InputSpec(shape=[ + 1, 3, cfg.num_seg, cfg.target_size, cfg.target_size + ], + dtype='float32'), + ]] + return input_spec + + +def main(): + args = parse_args() + cfg, model_name = trim_config( + get_config(args.config, overrides=args.override, show=False)) + + print(f"Building model({model_name})...") + model = build_model(cfg.MODEL) + assert osp.isfile( + args.pretrained_params + ), f"pretrained params ({args.pretrained_params} is not a file path.)" + + if not os.path.isdir(args.output_path): + os.makedirs(args.output_path) + + print(f"Loading params from ({args.pretrained_params})...") + params = paddle.load(args.pretrained_params) + model.set_dict(params) + + model.eval() + + # for rep nets + for layer in model.sublayers(): + if hasattr(layer, "rep") and not getattr(layer, "is_repped"): + layer.rep() + + input_spec = get_input_spec(cfg.INFERENCE, model_name) + model = to_static(model, input_spec=input_spec) + paddle.jit.save( + model, + osp.join(args.output_path, + model_name if args.save_name is None else args.save_name)) + print( + f"model ({model_name}) has been already saved in ({args.output_path}).") + + +if __name__ == "__main__": + main() diff --git a/Bank_second_part/detect_process/tools/predict.py b/Bank_second_part/detect_process/tools/predict.py new file mode 100644 index 0000000..bc9bd8c --- /dev/null +++ b/Bank_second_part/detect_process/tools/predict.py @@ -0,0 +1,327 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
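+
+# Example usage (paths are illustrative, not shipped with this repo):
+#   python tools/predict.py -c configs/example.yaml -i data/example.avi \
+#       --model_file inference/ppTSM.pdmodel --params_file inference/ppTSM.pdiparams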
+ +import argparse +import os +from os import path as osp +import paddle +from paddle import inference +from paddle.inference import Config, create_predictor + +from utils import build_inference_helper +from paddlevideo.utils import get_config + + +def parse_args(): + def str2bool(v): + return v.lower() in ("true", "t", "1") + + # general params + parser = argparse.ArgumentParser("PaddleVideo Inference model script") + parser.add_argument( + '-c', + '--config', + type=str, + default='configs/example.yaml', + help='config file path') + parser.add_argument( + '-o', + '--override', + action='append', + default=[], + help='config options to be overridden') + parser.add_argument("-i", "--input_file", type=str, help="input file path") + parser.add_argument( + "--time_test_file", + type=str2bool, + default=False, + help="whether input time test file") + parser.add_argument("--model_file", type=str) + parser.add_argument("--params_file", type=str) + + # params for paddle predict + parser.add_argument("-b", "--batch_size", type=int, default=1) + parser.add_argument("--use_gpu", type=str2bool, default=True) + parser.add_argument("--use_xpu", type=str2bool, default=False) + parser.add_argument("--use_npu", type=str2bool, default=False) + parser.add_argument("--precision", type=str, default="fp32") + parser.add_argument("--ir_optim", type=str2bool, default=True) + parser.add_argument("--use_tensorrt", type=str2bool, default=False) + parser.add_argument("--gpu_mem", type=int, default=8000) + parser.add_argument("--enable_benchmark", type=str2bool, default=False) + parser.add_argument("--enable_mkldnn", type=str2bool, default=False) + parser.add_argument("--cpu_threads", type=int, default=None) + parser.add_argument("--disable_glog", type=str2bool, default=False) + # parser.add_argument("--hubserving", type=str2bool, default=False) #TODO + + return parser.parse_args() + + +def create_paddle_predictor(args, cfg): + config = Config(args.model_file, args.params_file) + if args.use_gpu: + config.enable_use_gpu(args.gpu_mem, 0) + elif args.use_npu: + config.enable_npu() + elif args.use_xpu: + config.enable_xpu() + else: + config.disable_gpu() + if args.cpu_threads: + config.set_cpu_math_library_num_threads(args.cpu_threads) + if args.enable_mkldnn: + # cache 10 different shapes for mkldnn to avoid memory leak + config.set_mkldnn_cache_capacity(10) + config.enable_mkldnn() + if args.precision == "fp16": + config.enable_mkldnn_bfloat16() + + # config.disable_glog_info() + config.switch_ir_optim(args.ir_optim) # default true + if args.use_tensorrt: + # choose precision + if args.precision == "fp16": + precision = inference.PrecisionType.Half + elif args.precision == "int8": + precision = inference.PrecisionType.Int8 + else: + precision = inference.PrecisionType.Float32 + + # calculate real max batch size during inference when tenrotRT enabled + max_batch_size = args.batch_size + if 'num_seg' in cfg.INFERENCE: + # num_seg: number of segments when extracting frames. + # seg_len: number of frames extracted within a segment, default to 1. + # num_views: the number of video frame groups obtained by cropping and flipping, + # uniformcrop=3, tencrop=10, centercrop=1. 
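+            # The effective max_batch_size passed to TensorRT below is
+            # batch_size * num_views * num_seg * seg_len, so every cropped view of
+            # every sampled segment fits into a single engine batch.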
+ num_seg = cfg.INFERENCE.num_seg + seg_len = cfg.INFERENCE.get('seg_len', 1) + num_views = 1 + if 'tsm' in cfg.model_name.lower(): + num_views = 1 # CenterCrop + elif 'tsn' in cfg.model_name.lower(): + num_views = 10 # TenCrop + elif 'timesformer' in cfg.model_name.lower(): + num_views = 3 # UniformCrop + elif 'videoswin' in cfg.model_name.lower(): + num_views = 3 # UniformCrop + elif 'tokenshift' in cfg.model_name.lower(): + num_views = 3 # UniformCrop + max_batch_size = args.batch_size * num_views * num_seg * seg_len + config.enable_tensorrt_engine( + precision_mode=precision, max_batch_size=max_batch_size) + + config.enable_memory_optim() + # use zero copy + config.switch_use_feed_fetch_ops(False) + + # disable glog + if args.disable_glog: + config.disable_glog_info() + + # for ST-GCN tensorRT case usage + # config.delete_pass("shuffle_channel_detect_pass") + + predictor = create_predictor(config) + + return config, predictor + + +def parse_file_paths(input_path: str) -> list: + if osp.isfile(input_path): + files = [ + input_path, + ] + else: + files = os.listdir(input_path) + files = [ + file for file in files + if (file.endswith(".avi") or file.endswith(".mp4")) + ] + files = [osp.join(input_path, file) for file in files] + return files + + +def main(): + """predict using paddle inference model + """ + args = parse_args() + cfg = get_config(args.config, overrides=args.override, show=False) + + model_name = cfg.model_name + print(f"Inference model({model_name})...") + InferenceHelper = build_inference_helper(cfg.INFERENCE) + + inference_config, predictor = create_paddle_predictor(args, cfg) + + # get input_tensor and output_tensor + input_names = predictor.get_input_names() + output_names = predictor.get_output_names() + input_tensor_list = [] + output_tensor_list = [] + for item in input_names: + input_tensor_list.append(predictor.get_input_handle(item)) + for item in output_names: + output_tensor_list.append(predictor.get_output_handle(item)) + + # get the absolute file path(s) to be processed + if model_name in ["MSTCN", "ASRF"]: + files = InferenceHelper.get_process_file(args.input_file) + else: + files = parse_file_paths(args.input_file) + + if model_name == 'TransNetV2': + for file in files: + inputs = InferenceHelper.preprocess(file) + outputs = [] + for input in inputs: + # Run inference + for i in range(len(input_tensor_list)): + input_tensor_list[i].copy_from_cpu(input) + predictor.run() + output = [] + for j in range(len(output_tensor_list)): + output.append(output_tensor_list[j].copy_to_cpu()) + outputs.append(output) + + # Post process output + InferenceHelper.postprocess(outputs) + + elif model_name == 'AVA_SlowFast_FastRcnn': + for file in files: # for videos + inputs = InferenceHelper.preprocess(file) + outputs = [] + for input in inputs: + # Run inference + input_len = len(input_tensor_list) + + for i in range(input_len): + if type(input[i]) == paddle.Tensor: + input_tmp = input[i].numpy() + else: + input_tmp = input[i] + input_tensor_list[i].copy_from_cpu(input_tmp) + predictor.run() + output = [] + for j in range(len(output_tensor_list)): + output.append(output_tensor_list[j].copy_to_cpu()) + outputs.append(output) + + # Post process output + InferenceHelper.postprocess(outputs) + elif model_name == 'YOWO': + for file in files: # for videos + (_, filename) = os.path.split(file) + (filename, _) = os.path.splitext(filename) + save_dir = osp.join('inference', 'YOWO_infer') + if not osp.exists('inference'): + os.mkdir('inference') + if not osp.exists(save_dir): + 
os.mkdir(save_dir) + save_path = osp.join(save_dir, filename) + if not osp.exists(save_path): + os.mkdir(save_path) + inputs, frames = InferenceHelper.preprocess(file) + for idx, input in enumerate(inputs): + # Run inference + outputs = [] + input_len = len(input_tensor_list) + for i in range(input_len): + input_tensor_list[i].copy_from_cpu(input[i]) + predictor.run() + for j in range(len(output_tensor_list)): + outputs.append(output_tensor_list[j].copy_to_cpu()) + # Post process output + InferenceHelper.postprocess(outputs, frames[idx], osp.join(save_path, str(idx).zfill(3))) + else: + if args.enable_benchmark: + num_warmup = 3 + + # instantiate auto log + try: + import auto_log + except ImportError as e: + print(f"{e}, [git+https://github.com/LDOUBLEV/AutoLog] " + f"package and it's dependencies is required for " + f"python-inference when enable_benchmark=True.") + pid = os.getpid() + autolog = auto_log.AutoLogger( + model_name=cfg.model_name, + model_precision=args.precision, + batch_size=args.batch_size, + data_shape="dynamic", + save_path="./output/auto_log.lpg", + inference_config=inference_config, + pids=pid, + process_name=None, + gpu_ids=0 if args.use_gpu else None, + time_keys=[ + 'preprocess_time', 'inference_time', 'postprocess_time' + ], + warmup=num_warmup) + if not args.time_test_file: + test_video_num = 15 + files = [args.input_file for _ in range(test_video_num)] + else: + f_input = open(args.input_file, 'r') + files = [i.strip() for i in f_input.readlines()] + test_video_num = len(files) + f_input.close() + + # Inferencing process + batch_num = args.batch_size + for st_idx in range(0, len(files), batch_num): + ed_idx = min(st_idx + batch_num, len(files)) + + # auto log start + if args.enable_benchmark: + autolog.times.start() + + # Pre process batched input + batched_inputs = InferenceHelper.preprocess_batch( + files[st_idx:ed_idx]) + + # get pre process time cost + if args.enable_benchmark: + autolog.times.stamp() + + # run inference + for i in range(len(input_tensor_list)): + input_tensor_list[i].copy_from_cpu(batched_inputs[i]) + predictor.run() + + batched_outputs = [] + for j in range(len(output_tensor_list)): + batched_outputs.append(output_tensor_list[j].copy_to_cpu()) + + # get inference process time cost + if args.enable_benchmark: + autolog.times.stamp() + + InferenceHelper.postprocess(batched_outputs, + not args.enable_benchmark) + + # get post process time cost + if args.enable_benchmark: + autolog.times.end(stamp=True) + + # time.sleep(0.01) # sleep for T4 GPU + + # report benchmark log if enabled + if args.enable_benchmark: + autolog.report() + + +if __name__ == "__main__": + main() diff --git a/Bank_second_part/detect_process/tools/summary.py b/Bank_second_part/detect_process/tools/summary.py new file mode 100644 index 0000000..28bd6f7 --- /dev/null +++ b/Bank_second_part/detect_process/tools/summary.py @@ -0,0 +1,82 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
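+
+# Example usage (the config path is illustrative):
+#   python tools/summary.py -c configs/recognition/pptsm/pptsm.yaml --num_seg 8 --img_size 224 --FLOPs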
+ +import argparse +import os +import sys +import os.path as osp + +import paddle +import paddle.nn.functional as F +from paddle.jit import to_static +import paddleslim + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) + +from paddlevideo.modeling.builder import build_model +from paddlevideo.utils import get_config + + +def parse_args(): + + parser = argparse.ArgumentParser("PaddleVideo Summary") + parser.add_argument('-c', + '--config', + type=str, + default='configs/example.yaml', + help='config file path') + + parser.add_argument("--img_size", type=int, default=224) + parser.add_argument("--num_seg", type=int, default=8) + parser.add_argument("--FLOPs", + action="store_true", + help="whether to print FLOPs") + + return parser.parse_args() + + +def _trim(cfg, args): + """ + Reuse the trainging config will bring useless attribute, such as: backbone.pretrained model. Trim it here. + """ + model_name = cfg.model_name + cfg = cfg.MODEL + cfg.backbone.pretrained = "" + + if 'num_seg' in cfg.backbone: + cfg.backbone.num_seg = args.num_seg + return cfg, model_name + + +def main(): + args = parse_args() + cfg, model_name = _trim(get_config(args.config, show=False), args) + print(f"Building model({model_name})...") + model = build_model(cfg) + + img_size = args.img_size + num_seg = args.num_seg + #NOTE: only support tsm now, will refine soon + params_info = paddle.summary(model, (1, 1, num_seg, 3, img_size, img_size)) + print(params_info) + + if args.FLOPs: + flops_info = paddleslim.analysis.flops( + model, [1, 1, num_seg, 3, img_size, img_size]) + print(flops_info) + + +if __name__ == "__main__": + main() diff --git a/Bank_second_part/detect_process/tools/utils.py b/Bank_second_part/detect_process/tools/utils.py new file mode 100644 index 0000000..bbdd2d1 --- /dev/null +++ b/Bank_second_part/detect_process/tools/utils.py @@ -0,0 +1,1670 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import shutil +import sys +from typing import List +import pickle + +import cv2 +try: + import imageio +except ImportError as e: + print( + f"Warning! {e}, [imageio] package and it's dependencies is required for VideoSwin." + ) +try: + import matplotlib as mpl + import matplotlib.cm as cm +except ImportError as e: + print( + f"Warning! {e}, [matplotlib] package and it's dependencies is required for ADDS." 
+ ) +import numpy as np +import paddle +import paddle.nn.functional as F +import pandas +from PIL import Image + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) +from abc import abstractmethod + +from paddlevideo.loader.builder import build_pipeline +from paddlevideo.loader.pipelines import ( + AutoPadding, CenterCrop, DecodeSampler, FeatureDecoder, FrameDecoder, + GroupResize, Image2Array, ImageDecoder, JitterScale, MultiCrop, + Normalization, PackOutput, Sampler, SamplerPkl, Scale, SkeletonNorm, + TenCrop, ToArray, UniformCrop, VideoDecoder, SegmentationSampler, + SketeonCropSample, MultiCenterCrop, SketeonCropSample, UniformSampleFrames, + PoseDecode, PoseCompact, Resize, CenterCrop_V2, GeneratePoseTarget, + FormatShape, Collect) +from paddlevideo.metrics.ava_utils import read_labelmap +from paddlevideo.metrics.bmn_metric import boundary_choose, soft_nms +from paddlevideo.utils import Registry, build, get_config +from paddlevideo.modeling.framework.segmenters.utils import ASRFPostProcessing + +from tools.ava_predict import (detection_inference, frame_extraction, + get_detection_result, get_timestep_result, + pack_result, visualize) +from paddlevideo.modeling.framework.localizers.yowo_utils import nms, get_region_boxes + +INFERENCE = Registry('inference') + + +def build_inference_helper(cfg): + return build(cfg, INFERENCE) + + +class Base_Inference_helper(): + def __init__(self, + num_seg=8, + seg_len=1, + short_size=256, + target_size=224, + top_k=1): + """Base_Inference_helper + + Args: + num_seg (int, optional): number of segmentations of an sliced input video. Defaults to 8. + seg_len (int, optional): length of each segmentation. Defaults to 1. + short_size (int, optional): short size of input video. Defaults to 256. + target_size (int, optional): size of cropped video. Defaults to 224. + top_k (int, optional): select topk result in outputs. Defaults to 1. + """ + self.num_seg = num_seg + self.seg_len = seg_len + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + + @abstractmethod + def preprocess(self, input_file: str): + """preprocess abstractmethod + + Args: + input_file (str): input file path. + """ + pass + + def preprocess_batch(self, file_list: List[str]) -> List[np.ndarray]: + """preprocess for file list + + Args: + file_list (List[str]): file pathes in an list, [path1, path2, ...]. + + Returns: + List[np.ndarray]: batched inputs data, [data_batch[0], data_batch[1], ...]. + """ + batched_inputs = [] + for file in file_list: + inputs = self.preprocess(file) + batched_inputs.append(inputs) + batched_inputs = [ + np.concatenate([item[i] for item in batched_inputs]) + for i in range(len(batched_inputs[0])) + ] + self.input_file = file_list + return batched_inputs + + def postprocess(self, + output: np.ndarray, + print_output: bool = True, + return_result: bool = False): + """postprocess + + Args: + output (np.ndarray): batched output scores, shape of (batch_size, class_num). + print_output (bool, optional): whether to print result. Defaults to True. 
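+            return_result (bool, optional): whether to also return the top-k results
+                as a list of dicts (video_id, topk_class, topk_scores). Defaults to False.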
+ """ + if not isinstance(self.input_file, list): + self.input_file = [ + self.input_file, + ] + output = output[0] # [B, num_cls] + N = len(self.input_file) + if output.shape[0] != N: + output = output.reshape([N] + [output.shape[0] // N] + + list(output.shape[1:])) # [N, T, C] + output = output.mean(axis=1) # [N, C] + output = F.softmax(paddle.to_tensor(output), axis=-1).numpy() + results_list = [] + for i in range(N): + classes = np.argpartition(output[i], -self.top_k)[-self.top_k:] + classes = classes[np.argsort(-output[i, classes])] + scores = output[i, classes] + topk_class = classes[:self.top_k] + topk_scores = scores[:self.top_k] + result = { + "video_id": self.input_file[i], + "topk_class": topk_class, + "topk_scores": topk_scores + } + results_list.append(result) + if print_output: + print("Current video file: {0}".format(self.input_file[i])) + print("\ttop-{0} class: {1}".format(self.top_k, topk_class)) + print("\ttop-{0} score: {1}".format(self.top_k, topk_scores)) + if return_result: + return results_list + + +@INFERENCE.register() +class ppTSM_Inference_helper(Base_Inference_helper): + def __init__(self, + num_seg=8, + seg_len=1, + short_size=256, + target_size=224, + top_k=1): + self.num_seg = num_seg + self.seg_len = seg_len + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'filename': input_file} + img_mean = [0.485, 0.456, 0.406] + img_std = [0.229, 0.224, 0.225] + ops = [ + VideoDecoder(backend="decord"), + Sampler(self.num_seg, self.seg_len, valid_mode=True), + Scale(self.short_size), + CenterCrop(self.target_size), + Image2Array(), + Normalization(img_mean, img_std) + ] + for op in ops: + results = op(results) + + res = np.expand_dims(results['imgs'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class ppTSN_Inference_helper(Base_Inference_helper): + def __init__(self, + num_seg=25, + seg_len=1, + short_size=256, + target_size=224, + top_k=1): + self.num_seg = num_seg + self.seg_len = seg_len + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'filename': input_file} + img_mean = [0.485, 0.456, 0.406] + img_std = [0.229, 0.224, 0.225] + ops = [ + VideoDecoder(backend="decord"), + Sampler(self.num_seg, + self.seg_len, + valid_mode=True, + select_left=True), + Scale(self.short_size, + fixed_ratio=True, + do_round=True, + backend='cv2'), + TenCrop(self.target_size), + Image2Array(), + Normalization(img_mean, img_std) + ] + for op in ops: + results = op(results) + + res = np.expand_dims(results['imgs'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class BMN_Inference_helper(Base_Inference_helper): + def __init__(self, feat_dim, dscale, tscale, result_path): + self.feat_dim = feat_dim + self.dscale = dscale + self.tscale = tscale + self.result_path = result_path + if not os.path.isdir(self.result_path): + os.makedirs(self.result_path) + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + file_info = json.load(open(input_file)) + self.feat_path = file_info['feat_path'] + 
self.video_duration = file_info['duration_second'] + feat = np.load(self.feat_path).astype('float32').T + res = np.expand_dims(feat, axis=0).copy() + + return [res] + + def postprocess(self, outputs, print_output=True): + """ + output: list + """ + pred_bm, pred_start, pred_end = outputs + self._gen_props(pred_bm, pred_start[0], pred_end[0], print_output) + + def _gen_props(self, pred_bm, pred_start, pred_end, print_output): + snippet_xmins = [1.0 / self.tscale * i for i in range(self.tscale)] + snippet_xmaxs = [ + 1.0 / self.tscale * i for i in range(1, self.tscale + 1) + ] + + pred_bm = pred_bm[0, 0, :, :] * pred_bm[0, 1, :, :] + start_mask = boundary_choose(pred_start) + start_mask[0] = 1. + end_mask = boundary_choose(pred_end) + end_mask[-1] = 1. + score_vector_list = [] + for idx in range(self.dscale): + for jdx in range(self.tscale): + start_index = jdx + end_index = start_index + idx + if end_index < self.tscale and start_mask[ + start_index] == 1 and end_mask[end_index] == 1: + xmin = snippet_xmins[start_index] + xmax = snippet_xmaxs[end_index] + xmin_score = pred_start[start_index] + xmax_score = pred_end[end_index] + bm_score = pred_bm[idx, jdx] + conf_score = xmin_score * xmax_score * bm_score + score_vector_list.append([xmin, xmax, conf_score]) + + cols = ["xmin", "xmax", "score"] + score_vector_list = np.stack(score_vector_list) + df = pandas.DataFrame(score_vector_list, columns=cols) + + result_dict = {} + proposal_list = [] + df = soft_nms(df, alpha=0.4, t1=0.55, t2=0.9) + for idx in range(min(100, len(df))): + tmp_prop={"score":df.score.values[idx], \ + "segment":[max(0,df.xmin.values[idx])*self.video_duration, \ + min(1,df.xmax.values[idx])*self.video_duration]} + proposal_list.append(tmp_prop) + + result_dict[self.feat_path] = proposal_list + + # print top-5 predictions + if print_output: + print("Current video file: {0} :".format(self.feat_path)) + for pred in proposal_list[:5]: + print(pred) + + # save result + outfile = open( + os.path.join(self.result_path, "bmn_results_inference.json"), "w") + + json.dump(result_dict, outfile) + + +@INFERENCE.register() +class TokenShift_Inference_helper(Base_Inference_helper): + def __init__(self, + num_seg=8, + seg_len=1, + short_size=256, + target_size=256, + top_k=1, + mean=[0.5, 0.5, 0.5], + std=[0.5, 0.5, 0.5]): + self.num_seg = num_seg + self.seg_len = seg_len + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + self.mean = mean + self.std = std + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'filename': input_file} + ops = [ + VideoDecoder(backend='pyav', mode='test', num_seg=self.num_seg), + Sampler(self.num_seg, self.seg_len, valid_mode=True), + Normalization(self.mean, self.std, tensor_shape=[1, 1, 1, 3]), + Image2Array(data_format='cthw'), + JitterScale(self.short_size, self.short_size), + MultiCenterCrop(self.target_size) + ] + for op in ops: + results = op(results) + + # [N,C,Tx3,H,W] + res = np.expand_dims(results['imgs'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class TimeSformer_Inference_helper(Base_Inference_helper): + def __init__(self, + num_seg=8, + seg_len=1, + short_size=224, + target_size=224, + top_k=1, + mean=[0.45, 0.45, 0.45], + std=[0.225, 0.225, 0.225]): + self.num_seg = num_seg + self.seg_len = seg_len + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + self.mean = mean + 
self.std = std + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'filename': input_file} + ops = [ + VideoDecoder(backend='pyav', mode='test', num_seg=self.num_seg), + Sampler(self.num_seg, + self.seg_len, + valid_mode=True, + linspace_sample=True), + Normalization(self.mean, self.std, tensor_shape=[1, 1, 1, 3]), + Image2Array(data_format='cthw'), + JitterScale(self.short_size, self.short_size), + UniformCrop(self.target_size) + ] + for op in ops: + results = op(results) + + # [N,C,Tx3,H,W] + res = np.expand_dims(results['imgs'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class VideoSwin_Inference_helper(Base_Inference_helper): + def __init__(self, + num_seg=4, + seg_len=32, + frame_interval=2, + short_size=224, + target_size=224, + top_k=1, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375]): + + self.num_seg = num_seg + self.seg_len = seg_len + self.frame_interval = frame_interval + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + self.mean = mean + self.std = std + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + self.input_file = input_file + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'filename': input_file} + ops = [ + VideoDecoder(backend='decord', mode='valid'), + Sampler(num_seg=self.num_seg, + frame_interval=self.frame_interval, + seg_len=self.seg_len, + valid_mode=True, + use_pil=False), + Scale(short_size=self.short_size, + fixed_ratio=False, + keep_ratio=True, + backend='cv2', + do_round=True), + CenterCrop(target_size=224, backend='cv2'), + Normalization(mean=self.mean, + std=self.std, + tensor_shape=[3, 1, 1, 1], + inplace=True), + Image2Array(data_format='cthw') + ] + for op in ops: + results = op(results) + + res = np.expand_dims(results['imgs'], axis=0).copy() + return [res] + + def postprocess(self, output, print_output=True): + """ + output: list + """ + if not isinstance(self.input_file, list): + self.input_file = [ + self.input_file, + ] + output = output[0] # [B, num_cls] + N = len(self.input_file) + if output.shape[0] != N: + output = output.reshape([N] + [output.shape[0] // N] + + list(output.shape[1:])) # [N, T, C] + output = output.mean(axis=1) # [N, C] + for i in range(N): + classes = np.argpartition(output[i], -self.top_k)[-self.top_k:] + classes = classes[np.argsort(-output[i, classes])] + scores = output[i, classes] + if print_output: + print("Current video file: {0}".format(self.input_file[i])) + for j in range(self.top_k): + print("\ttop-{0} class: {1}".format(j + 1, classes[j])) + print("\ttop-{0} score: {1}".format(j + 1, scores[j])) + + +@INFERENCE.register() +class VideoSwin_TableTennis_Inference_helper(Base_Inference_helper): + def __init__(self, + num_seg=1, + seg_len=32, + short_size=256, + target_size=224, + top_k=1): + self.num_seg = num_seg + self.seg_len = seg_len + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'frame_dir': input_file, 'suffix': 'img_{:05}.jpg'} + img_mean = [123.675, 116.28, 103.53] + img_std = [58.395, 57.12, 57.375] + ops = [ + FrameDecoder(), + SamplerPkl(num_seg=self.num_seg, + 
seg_len=self.seg_len, + backend='cv2', + valid_mode=True), + Scale(short_size=self.short_size, + fixed_ratio=False, + keep_ratio=True, + backend='cv2', + do_round=True), + UniformCrop(target_size=self.target_size, backend='cv2'), + Normalization(mean=img_mean, + std=img_std, + tensor_shape=[3, 1, 1, 1], + inplace=True), + Image2Array(data_format='cthw') + ] + for op in ops: + results = op(results) + + res = np.expand_dims(results['imgs'], axis=0).copy() + return [res] + + def add_text_to_video( + self, + video_path, + output_dir="applications/TableTennis/ActionRecognition/results", + text=None): + os.makedirs(output_dir, exist_ok=True) + if video_path.endswith('.pkl'): + try: + import cPickle as pickle + from cStringIO import StringIO + except ImportError: + import pickle + from io import BytesIO + from PIL import Image + data_loaded = pickle.load(open(video_path, 'rb'), encoding='bytes') + _, _, frames = data_loaded + frames_len = len(frames) + + else: + videoCapture = cv2.VideoCapture() + videoCapture.open(video_path) + + fps = videoCapture.get(cv2.CAP_PROP_FPS) + frame_width = int(videoCapture.get(cv2.CAP_PROP_FRAME_WIDTH)) + frame_height = int(videoCapture.get(cv2.CAP_PROP_FRAME_HEIGHT)) + + frames_len = videoCapture.get(cv2.CAP_PROP_FRAME_COUNT) + print("fps=", int(fps), "frames=", int(frames_len), "scale=", + f"{frame_height}x{frame_width}") + + frames_rgb_list = [] + for i in range(int(frames_len)): + if video_path.endswith('.pkl'): + frame = np.array( + Image.open(BytesIO(frames[i])).convert("RGB").resize( + (240, 135)))[:, :, ::-1].astype('uint8') + else: + _, frame = videoCapture.read() + frame = cv2.putText(frame, text, (30, 30), cv2.FONT_HERSHEY_COMPLEX, + 1.0, (0, 0, 255), 2) + frames_rgb_list.append(frame[:, :, ::-1]) # bgr to rgb + if not video_path.endswith('.pkl'): + videoCapture.release() + cv2.destroyAllWindows() + output_filename = os.path.basename(video_path) + output_filename = output_filename.split('.')[0] + '.gif' + imageio.mimsave(f'{output_dir}/{output_filename}', + frames_rgb_list, + 'GIF', + duration=0.00085) + + def postprocess(self, output, print_output=True, save_gif=True): + """ + output: list + """ + if not isinstance(self.input_file, list): + self.input_file = [ + self.input_file, + ] + output = output[0] # [B, num_cls] + N = len(self.input_file) + if output.shape[0] != N: + output = output.reshape([N] + [output.shape[0] // N] + + list(output.shape[1:])) # [N, T, C] + output = output.mean(axis=1) # [N, C] + for i in range(N): + classes = np.argpartition(output[i], -self.top_k)[-self.top_k:] + classes = classes[np.argsort(-output[i, classes])] + scores = output[i, classes] + if print_output: + print("Current video file: {0}".format(self.input_file[i])) + for j in range(self.top_k): + print("\ttop-{0} class: {1}".format(j + 1, classes[j])) + print("\ttop-{0} score: {1}".format(j + 1, scores[j])) + if save_gif: + self.add_text_to_video( + self.input_file[0], + text=f"{str(classes[0])} {float(scores[0]):.5f}") + + +@INFERENCE.register() +class SlowFast_Inference_helper(Base_Inference_helper): + def __init__(self, + num_frames=32, + sampling_rate=2, + target_size=256, + alpha=8, + top_k=1): + self.num_frames = num_frames + self.sampling_rate = sampling_rate + self.target_size = target_size + self.alpha = alpha + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = { + 'filename': input_file, + 
'temporal_sample_index': 0, + 'spatial_sample_index': 0, + 'temporal_num_clips': 1, + 'spatial_num_clips': 1 + } + img_mean = [0.45, 0.45, 0.45] + img_std = [0.225, 0.225, 0.225] + ops = [ + DecodeSampler(self.num_frames, self.sampling_rate, test_mode=True), + JitterScale(self.target_size, self.target_size), + MultiCrop(self.target_size), + Image2Array(transpose=False), + Normalization(img_mean, img_std, tensor_shape=[1, 1, 1, 3]), + PackOutput(self.alpha), + ] + for op in ops: + results = op(results) + + res = [] + for item in results['imgs']: + res.append(np.expand_dims(item, axis=0).copy()) + return res + + def postprocess(self, output, print_output=True): + """ + output: list + """ + if not isinstance(self.input_file, list): + self.input_file = [ + self.input_file, + ] + output = output[0] # [B, num_cls] + + N = len(self.input_file) + if output.shape[0] != N: + output = output.reshape([N] + [output.shape[0] // N] + + list(output.shape[1:])) # [N, T, C] + output = output.mean(axis=1) # [N, C] + # output = F.softmax(paddle.to_tensor(output), axis=-1).numpy() # done in it's head + for i in range(N): + classes = np.argpartition(output[i], -self.top_k)[-self.top_k:] + classes = classes[np.argsort(-output[i, classes])] + scores = output[i, classes] + if print_output: + print("Current video file: {0}".format(self.input_file[i])) + for j in range(self.top_k): + print("\ttop-{0} class: {1}".format(j + 1, classes[j])) + print("\ttop-{0} score: {1}".format(j + 1, scores[j])) + + +@INFERENCE.register() +class STGCN_Inference_helper(Base_Inference_helper): + def __init__(self, + num_channels, + window_size, + vertex_nums, + person_nums, + top_k=1): + self.num_channels = num_channels + self.window_size = window_size + self.vertex_nums = vertex_nums + self.person_nums = person_nums + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + data = np.load(input_file) + results = {'data': data} + ops = [AutoPadding(window_size=self.window_size), SkeletonNorm()] + for op in ops: + results = op(results) + + res = np.expand_dims(results['data'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class CTRGCN_Inference_helper(Base_Inference_helper): + def __init__(self, + num_channels=3, + vertex_nums=25, + person_nums=2, + window_size=64, + p_interval=[0.95], + top_k=1): + self.window_size = window_size + self.p_interval = p_interval + self.num_channels = num_channels + self.vertex_nums = vertex_nums + self.person_nums = person_nums + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + data = np.load(input_file) + results = {'data': data} + ops = [ + SketeonCropSample(window_size=self.window_size, + p_interval=self.p_interval) + ] + for op in ops: + results = op(results) + + res = np.expand_dims(results['data'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class AGCN2s_Inference_helper(Base_Inference_helper): + def __init__(self, + window_size=300, + num_channels=3, + vertex_nums=25, + person_nums=2, + top_k=1): + self.window_size = window_size + self.num_channels = num_channels + self.vertex_nums = vertex_nums + self.person_nums = person_nums + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is 
not None, "{0} not exists".format( + input_file) + data = np.load(input_file) + results = {'data': data} + + res = np.expand_dims(results['data'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class MSTCN_Inference_helper(Base_Inference_helper): + def __init__(self, num_channels, actions_map_file_path, feature_path=None): + self.num_channels = num_channels + file_ptr = open(actions_map_file_path, 'r') + actions = file_ptr.read().split('\n')[:-1] + file_ptr.close() + self.actions_dict = dict() + for a in actions: + self.actions_dict[a.split()[1]] = int(a.split()[0]) + + self.feature_path = feature_path + self.file_name_list = [] + + def get_process_file(self, input_file_txt): + with open(input_file_txt, 'r') as file_ptr: + info = file_ptr.read().split('\n')[:-1] + + files = [] + for video_name in info: + if self.feature_path is not None: + file_name = video_name.split('.')[0] + ".npy" + input_file = os.path.join(self.feature_path, file_name) + else: + input_file = video_name + + assert os.path.isfile( + input_file) is not None, "{0} not exists".format(input_file) + files.append(input_file) + + self.file_name_list.append(input_file.split('/')[-1].split('.')[0]) + return files + + def preprocess(self, input_file): + """ + input_file: str, feature file list txt path + return: list + """ + output_list = [] + + data = np.load(input_file) + results = {'video_feat': data, 'video_gt': None} + ops = [] + for op in ops: + results = op(results) + + res = np.expand_dims(results['video_feat'], axis=0).copy() + output_list.append(res) + return output_list + + def postprocess(self, output, print_output=True): + reslut_path = os.path.join("./inference/infer_results/") + if not os.path.isdir(reslut_path): + os.makedirs(reslut_path) + output = [output] + for outputs in output: + output_np = outputs[0] + recognition = [] + for i in range(output_np.shape[0]): + recognition = np.concatenate((recognition, [ + list(self.actions_dict.keys())[list( + self.actions_dict.values()).index(output_np[i])] + ])) + recog_content = list(recognition) + recog_content = [line + "\n" for line in recog_content] + + filename = self.file_name_list.pop(0) + + write_path = os.path.join(reslut_path, filename + ".txt") + f = open(write_path, "w") + f.writelines(recog_content) + f.close() + print("result write in : " + write_path) + + +@INFERENCE.register() +class ASRF_Inference_helper(Base_Inference_helper): + def __init__(self, + num_channels, + actions_map_file_path, + postprocessing_method, + boundary_threshold, + feature_path=None): + self.num_channels = num_channels + file_ptr = open(actions_map_file_path, 'r') + actions = file_ptr.read().split('\n')[:-1] + file_ptr.close() + self.actions_dict = dict() + for a in actions: + self.actions_dict[a.split()[1]] = int(a.split()[0]) + + self.postprocessing_method = postprocessing_method + self.boundary_threshold = boundary_threshold + self.feature_path = feature_path + self.file_name_list = [] + + def get_process_file(self, input_file_txt): + with open(input_file_txt, 'r') as file_ptr: + info = file_ptr.read().split('\n')[:-1] + + files = [] + for video_name in info: + if self.feature_path is not None: + file_name = video_name.split('.')[0] + ".npy" + input_file = os.path.join(self.feature_path, file_name) + else: + input_file = video_name + + assert os.path.isfile( + input_file) is not None, "{0} not exists".format(input_file) + files.append(input_file) + + self.file_name_list.append(input_file.split('/')[-1].split('.')[0]) + return files + + def preprocess(self, input_file): 
+ """ + input_file: str, feature file list txt path + return: list + """ + + output_list = [] + + data = np.load(input_file) + results = {'video_feat': data, 'video_gt': None} + ops = [] + for op in ops: + results = op(results) + + res = np.expand_dims(results['video_feat'], axis=0).copy() + output_list.append(res) + return output_list + + def postprocess(self, output, print_output=True): + reslut_path = os.path.join("./inference/infer_results/") + if not os.path.isdir(reslut_path): + os.makedirs(reslut_path) + output = [output] + for outputs in output: + outputs_cls_np = outputs[0] + outputs_boundary_np = outputs[1] + + output_np = ASRFPostProcessing( + outputs_cls_np, + outputs_boundary_np, + self.postprocessing_method, + boundary_threshold=self.boundary_threshold).numpy()[0, :] + + recognition = [] + for i in range(output_np.shape[0]): + recognition = np.concatenate((recognition, [ + list(self.actions_dict.keys())[list( + self.actions_dict.values()).index(output_np[i])] + ])) + recog_content = list(recognition) + recog_content = [line + "\n" for line in recog_content] + + filename = self.file_name_list.pop(0) + + write_path = os.path.join(reslut_path, filename + ".txt") + f = open(write_path, "w") + f.writelines(recog_content) + f.close() + print("result write in : " + write_path) + + +@INFERENCE.register() +class AttentionLSTM_Inference_helper(Base_Inference_helper): + def __init__( + self, + num_classes, #Optional, the number of classes to be classified. + feature_num, + feature_dims, + embedding_size, + lstm_size, + top_k=1): + self.num_classes = num_classes + self.feature_num = feature_num + self.feature_dims = feature_dims + self.embedding_size = embedding_size + self.lstm_size = lstm_size + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'filename': input_file} + ops = [FeatureDecoder(num_classes=self.num_classes, has_label=False)] + for op in ops: + results = op(results) + + res = [] + for modality in ['rgb', 'audio']: + res.append( + np.expand_dims(results[f'{modality}_data'], axis=0).copy()) + res.append( + np.expand_dims(results[f'{modality}_len'], axis=0).copy()) + res.append( + np.expand_dims(results[f'{modality}_mask'], axis=0).copy()) + return res + + +@INFERENCE.register() +class TransNetV2_Inference_helper(): + def __init__(self, + num_frames, + height, + width, + num_channels, + threshold=0.5, + output_path=None, + visualize=True): + self._input_size = (height, width, num_channels) + self.output_path = output_path + self.len_frames = 0 + self.threshold = threshold + self.visualize = visualize + + def input_iterator(self, frames): + # return windows of size 100 where the first/last 25 frames are from the previous/next batch + # the first and last window must be padded by copies of the first and last frame of the video + no_padded_frames_start = 25 + no_padded_frames_end = 25 + 50 - ( + len(frames) % 50 if len(frames) % 50 != 0 else 50) # 25 - 74 + + start_frame = np.expand_dims(frames[0], 0) + end_frame = np.expand_dims(frames[-1], 0) + padded_inputs = np.concatenate([start_frame] * no_padded_frames_start + + [frames] + + [end_frame] * no_padded_frames_end, 0) + + ptr = 0 + while ptr + 100 <= len(padded_inputs): + out = padded_inputs[ptr:ptr + 100] + out = out.astype(np.float32) + ptr += 50 + yield out[np.newaxis] + + def preprocess(self, input_file): + """ + input_file: str, file path + return: iterator + 
""" + try: + import ffmpeg + except ImportError as e: + print( + f"Warning! {e}, [ffmpeg-python] package and it's dependencies is required for TransNetV2." + ) + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + self.input_file = input_file + self.filename = os.path.splitext(os.path.split(self.input_file)[1])[0] + video_stream, err = ffmpeg.input( + self.input_file).output("pipe:", + format="rawvideo", + pix_fmt="rgb24", + s="48x27").run(capture_stdout=True, + capture_stderr=True) + self.frames = np.frombuffer(video_stream, + np.uint8).reshape([-1, 27, 48, 3]) + self.len_frames = len(self.frames) + + return self.input_iterator(self.frames) + + def predictions_to_scenes(self, predictions): + predictions = (predictions > self.threshold).astype(np.uint8) + scenes = [] + t, t_prev, start = -1, 0, 0 + for i, t in enumerate(predictions): + if t_prev == 1 and t == 0: + start = i + if t_prev == 0 and t == 1 and i != 0: + scenes.append([start, i]) + t_prev = t + if t == 0: + scenes.append([start, i]) + + # just fix if all predictions are 1 + if len(scenes) == 0: + return np.array([[0, len(predictions) - 1]], dtype=np.int32) + + return np.array(scenes, dtype=np.int32) + + def visualize_predictions(self, frames, predictions): + from PIL import Image, ImageDraw + + if isinstance(predictions, np.ndarray): + predictions = [predictions] + + ih, iw, ic = frames.shape[1:] + width = 25 + + # pad frames so that length of the video is divisible by width + # pad frames also by len(predictions) pixels in width in order to show predictions + pad_with = width - len(frames) % width if len( + frames) % width != 0 else 0 + frames = np.pad(frames, [(0, pad_with), (0, 1), (0, len(predictions)), + (0, 0)]) + + predictions = [np.pad(x, (0, pad_with)) for x in predictions] + height = len(frames) // width + + img = frames.reshape([height, width, ih + 1, iw + len(predictions), ic]) + img = np.concatenate(np.split( + np.concatenate(np.split(img, height), axis=2)[0], width), + axis=2)[0, :-1] + + img = Image.fromarray(img) + draw = ImageDraw.Draw(img) + + # iterate over all frames + for i, pred in enumerate(zip(*predictions)): + x, y = i % width, i // width + x, y = x * (iw + len(predictions)) + iw, y * (ih + 1) + ih - 1 + + # we can visualize multiple predictions per single frame + for j, p in enumerate(pred): + color = [0, 0, 0] + color[(j + 1) % 3] = 255 + + value = round(p * (ih - 1)) + if value != 0: + draw.line((x + j, y, x + j, y - value), + fill=tuple(color), + width=1) + return img + + def postprocess(self, outputs, print_output=True): + """ + output: list + """ + predictions = [] + for output in outputs: + single_frame_logits, all_frames_logits = output + single_frame_pred = F.sigmoid(paddle.to_tensor(single_frame_logits)) + all_frames_pred = F.sigmoid(paddle.to_tensor(all_frames_logits)) + predictions.append((single_frame_pred.numpy()[0, 25:75, 0], + all_frames_pred.numpy()[0, 25:75, 0])) + single_frame_pred = np.concatenate( + [single_ for single_, all_ in predictions]) + all_frames_pred = np.concatenate( + [all_ for single_, all_ in predictions]) + single_frame_predictions, all_frame_predictions = single_frame_pred[: + self + . + len_frames], all_frames_pred[: + self + . 
+ len_frames] + + scenes = self.predictions_to_scenes(single_frame_predictions) + + if print_output: + print("Current video file: {0}".format(self.input_file)) + print("\tShot Boundarys: {0}".format(scenes)) + + if self.output_path: + if not os.path.exists(self.output_path): + os.makedirs(self.output_path) + predictions = np.stack( + [single_frame_predictions, all_frame_predictions], 1) + predictions_file = os.path.join(self.output_path, + self.filename + "_predictions.txt") + np.savetxt(predictions_file, predictions, fmt="%.6f") + scenes_file = os.path.join(self.output_path, + self.filename + "_scenes.txt") + np.savetxt(scenes_file, scenes, fmt="%d") + + if self.visualize: + pil_image = self.visualize_predictions( + self.frames, + predictions=(single_frame_predictions, + all_frame_predictions)) + image_file = os.path.join(self.output_path, + self.filename + "_vis.png") + pil_image.save(image_file) + + +@INFERENCE.register() +class ADDS_Inference_helper(Base_Inference_helper): + def __init__(self, + frame_idxs=[0], + num_scales=4, + side_map={ + "2": 2, + "3": 3, + "l": 2, + "r": 3 + }, + height=256, + width=512, + full_res_shape=None, + num_channels=None, + img_ext=".png", + K=None): + + self.frame_idxs = frame_idxs + self.num_scales = num_scales + self.side_map = side_map + self.full_res_shape = full_res_shape + self.img_ext = img_ext + self.height = height + self.width = width + self.K = K + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = { + 'filename': input_file, + 'mode': 'infer', + 'day_or_night': 'day', + } + ops = [ + ImageDecoder( + backend='pil', + dataset='kitti', + frame_idxs=self.frame_idxs, + num_scales=self.num_scales, + side_map=self.side_map, + full_res_shape=self.full_res_shape, + img_ext=self.img_ext, + ), + GroupResize( + height=self.height, + width=self.width, + K=self.K, + scale=1, + mode='infer', + ), + ToArray(), + ] + for op in ops: + results = op(results) + res = results['imgs'][('color', 0, 0)] + res = np.expand_dims(res, axis=0).copy() + return [res] + + def postprocess(self, output, print_output, save_dir='data/'): + """ + output: list + """ + if not isinstance(self.input_file, list): + self.input_file = [ + self.input_file, + ] + print(len(output)) + N = len(self.input_file) + for i in range(N): + pred_depth = output[i] # [H, W] + if print_output: + print("Current input image: {0}".format(self.input_file[i])) + file_name = os.path.basename(self.input_file[i]).split('.')[0] + save_path = os.path.join(save_dir, + file_name + "_depth" + ".png") + pred_depth_color = self._convertPNG(pred_depth) + pred_depth_color.save(save_path) + print(f"pred depth image saved to: {save_path}") + + def _convertPNG(self, image_numpy): + disp_resized = cv2.resize(image_numpy, (1280, 640)) + disp_resized_np = disp_resized + vmax = np.percentile(disp_resized_np, 95) + normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax) + mapper = cm.ScalarMappable(norm=normalizer, cmap='magma') + colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * + 255).astype(np.uint8) + im = Image.fromarray(colormapped_im) + return im + + +@INFERENCE.register() +class AVA_SlowFast_FastRCNN_Inference_helper(Base_Inference_helper): + def __init__(self, + detection_model_name, + detection_model_weights, + config_file_path, + predict_stepsize=8, + output_stepsize=4, + output_fps=6, + out_filename='ava_det_demo.mp4', + num_frames=32, + alpha=4, 
+ target_size=256): + self.detection_model_name = detection_model_name + self.detection_model_weights = detection_model_weights + + self.config = get_config(config_file_path, + show=False) #parse config file + self.predict_stepsize = predict_stepsize + self.output_stepsize = output_stepsize + self.output_fps = output_fps + self.out_filename = out_filename + self.num_frames = num_frames + self.alpha = alpha + self.target_size = target_size + + def preprocess(self, input_file): + """ + input_file: str, file path + """ + + frame_dir = 'tmp_frames' + self.frame_paths, frames, FPS = frame_extraction(input_file, frame_dir) + num_frame = len(self.frame_paths) #视频秒数*FPS + assert num_frame != 0 + + # 帧图像高度和宽度 + h, w, _ = frames[0].shape + + # Get clip_len, frame_interval and calculate center index of each clip + data_process_pipeline = build_pipeline( + self.config.PIPELINE.test) #测试时输出处理流水配置 + + clip_len = self.config.PIPELINE.test.sample['clip_len'] + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + frame_interval = self.config.PIPELINE.test.sample['frame_interval'] + + # 此处关键帧每秒取一个 + clip_len = self.config.PIPELINE.test.sample['clip_len'] + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + frame_interval = self.config.PIPELINE.test.sample['frame_interval'] + window_size = clip_len * frame_interval + timestamps = np.arange(window_size // 2, + (num_frame + 1 - window_size // 2), + self.predict_stepsize) + + selected_frame_list = [] + for timestamp in timestamps: + selected_frame_list.append(self.frame_paths[timestamp - 1]) + + # Load label_map + label_map_path = self.config.DATASET.test['label_file'] + self.categories, self.class_whitelist = read_labelmap( + open(label_map_path)) + label_map = {} + for item in self.categories: + id = item['id'] + name = item['name'] + label_map[id] = name + + self.label_map = label_map + + detection_result_dir = 'tmp_detection' + detection_model_name = self.detection_model_name + detection_model_weights = self.detection_model_weights + detection_txt_list = detection_inference(selected_frame_list, + detection_result_dir, + detection_model_name, + detection_model_weights) + assert len(detection_txt_list) == len(timestamps) + + human_detections = [] + data_list = [] + person_num_list = [] + + for timestamp, detection_txt_path in zip(timestamps, + detection_txt_list): + proposals, scores = get_detection_result( + detection_txt_path, h, w, + (float)(self.config.DATASET.test['person_det_score_thr'])) + + if proposals.shape[0] == 0: + #person_num_list.append(0) + human_detections.append(None) + continue + + human_detections.append(proposals) + + result = get_timestep_result(frame_dir, + timestamp, + clip_len, + frame_interval, + FPS=FPS) + result["proposals"] = proposals + result["scores"] = scores + + new_result = data_process_pipeline(result) + proposals = new_result['proposals'] + + img_slow = new_result['imgs'][0] + img_slow = img_slow[np.newaxis, :] + img_fast = new_result['imgs'][1] + img_fast = img_fast[np.newaxis, :] + + proposals = proposals[np.newaxis, :] + + scores = scores[np.newaxis, :] + + img_shape = np.asarray(new_result['img_shape']) + img_shape = img_shape[np.newaxis, :] + + data = [ + paddle.to_tensor(img_slow, dtype='float32'), + paddle.to_tensor(img_fast, dtype='float32'), + paddle.to_tensor(proposals, dtype='float32'), + paddle.to_tensor(img_shape, dtype='int32') + ] + + person_num = proposals.shape[1] + person_num_list.append(person_num) + + data_list.append(data) + + self.human_detections = human_detections + 
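        # Worked example of the key-frame spacing computed above (numbers hypothetical):
        # with clip_len=32 and frame_interval=2, window_size=64, so for a 300-frame video
        # timestamps = np.arange(32, 269, predict_stepsize); with predict_stepsize=8 this
        # gives 32, 40, 48, ..., i.e. one action prediction window every 8 frames.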
self.person_num_list = person_num_list + self.timestamps = timestamps + self.frame_dir = frame_dir + self.detection_result_dir = detection_result_dir + + return data_list + + def postprocess(self, outputs, print_output=True): + """ + output: list + """ + predictions = [] + + assert len(self.person_num_list) == len(outputs) + + #print("*** self.human_detections",len( self.human_detections)) + #print("*** outputs",len( outputs)) + + index = 0 + for t_index in range(len(self.timestamps)): + if self.human_detections[t_index] is None: + predictions.append(None) + continue + + human_detection = self.human_detections[t_index] + + output = outputs[index] + result = output #长度为类别个数,不包含背景 + + person_num = self.person_num_list[index] + + index = index + 1 + + prediction = [] + + if human_detection is None: + predictions.append(None) + continue + + # N proposals + for i in range(person_num): + prediction.append([]) + + # Perform action score thr + for i in range(len(result)): # for class + if i + 1 not in self.class_whitelist: + continue + for j in range(person_num): + if result[i][j, 4] > self.config.MODEL.head['action_thr']: + prediction[j].append( + (self.label_map[i + 1], result[i][j, 4] + )) # label_map is a dict, label index start from 1 + predictions.append(prediction) + + results = [] + for human_detection, prediction in zip(self.human_detections, + predictions): + results.append(pack_result(human_detection, prediction)) + + def dense_timestamps(timestamps, n): + """Make it nx frames.""" + old_frame_interval = (timestamps[1] - timestamps[0]) + start = timestamps[0] - old_frame_interval / n * (n - 1) / 2 + new_frame_inds = np.arange( + len(timestamps) * n) * old_frame_interval / n + start + return new_frame_inds.astype(np.int) + + dense_n = int(self.predict_stepsize / self.output_stepsize) #30 + frames = [ + cv2.imread(self.frame_paths[i - 1]) + for i in dense_timestamps(self.timestamps, dense_n) + ] + + vis_frames = visualize(frames, results) + + try: + import moviepy.editor as mpy + except ImportError: + raise ImportError('Please install moviepy to enable output file') + + vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], + fps=self.output_fps) + vid.write_videofile(self.out_filename) + print("finish write !") + + # delete tmp files and dirs + shutil.rmtree(self.frame_dir) + shutil.rmtree(self.detection_result_dir) + + +@INFERENCE.register() +class PoseC3D_Inference_helper(Base_Inference_helper): + def __init__(self, top_k=1): + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + with open(input_file, 'rb') as f: + data = pickle.load(f) + self.input_file = input_file + + left_kp = [1, 3, 5, 7, 9, 11, 13, 15] + right_kp = [2, 4, 6, 8, 10, 12, 14, 16] + ops = [ + UniformSampleFrames(clip_len=48, num_clips=10, test_mode=True), + PoseDecode(), + PoseCompact(hw_ratio=1., allow_imgpad=True), + Resize(scale=(-1, 56)), + CenterCrop_V2(crop_size=56), + GeneratePoseTarget(sigma=0.6, + use_score=True, + with_kp=True, + with_limb=False, + double=True, + left_kp=left_kp, + right_kp=right_kp), + FormatShape(input_format='NCTHW'), + Collect(keys=['imgs', 'label'], meta_keys=[]) + ] + + for op in ops: + results = op(data) + results = [results[0][np.newaxis, :, :, :, :, :]] + self.num_segs = results[0].shape[1] + return results + + def postprocess(self, outputs, print_output=True): + batch_size = outputs[0].shape[0] + cls_score = 
outputs[0].reshape( + [batch_size // self.num_segs, self.num_segs, outputs[0].shape[-1]]) + output = F.softmax(paddle.to_tensor(cls_score), + axis=2).mean(axis=1).numpy() + N = len(self.input_file) + for i in range(N): + classes = np.argpartition(output[i], -self.top_k)[-self.top_k:] + classes = classes[np.argsort(-output[i, classes])] + scores = output[i, classes] + if print_output: + print("Current video file: {0}".format(self.input_file[i])) + for j in range(self.top_k): + print("\ttop-{0} class: {1}".format(j + 1, classes[j])) + print("\ttop-{0} score: {1}".format(j + 1, scores[j])) + + +@INFERENCE.register() +class YOWO_Inference_helper(Base_Inference_helper): + + def __init__(self, + num_seg=16, + target_size=224, + nms_thresh=0.5, + conf_thresh_valid=0.5, + mean=[0.4345, 0.4051, 0.3775], + std=[0.2768, 0.2713, 0.2737]): + self.num_seg = num_seg + self.target_size = target_size + self.nms_thresh = nms_thresh + self.conf_thresh_valid = conf_thresh_valid + self.mean = mean + self.std = std + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + cap = cv2.VideoCapture(input_file) + queue = [] + inputs = [] + frames = [] + while (cap.isOpened()): + ret, frame = cap.read() + if ret == False: + break + if len(queue) <= 0: # At initialization, populate queue with initial frame + for i in range(self.num_seg): + queue.append(frame) + + # Add the read frame to last and pop out the oldest one + queue.append(frame) + queue.pop(0) + + # Resize images + imgs = [cv2.resize(img, (self.target_size, self.target_size), interpolation=cv2.INTER_LINEAR) for img in + queue] + + # Convert image to CHW keeping BGR order. + imgs = [img.transpose([2, 0, 1]) for img in imgs] + + # Image [0, 255] -> [0, 1]. + imgs = [img / 255.0 for img in imgs] + + imgs = [ + np.ascontiguousarray( + img.reshape((3, imgs[0].shape[1], imgs[0].shape[2])) + ).astype(np.float32) + for img in imgs + ] + + # Concat list of images to single ndarray. 
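            # At this point `imgs` is a list of num_seg arrays shaped (3, target_size, target_size);
            # the concatenation below stacks them along a new time axis and the two expand_dims
            # calls that follow add batch dims, so each appended input has shape
            # (1, 1, 3, num_seg, target_size, target_size).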
+ imgs = np.concatenate( + [np.expand_dims(img, axis=1) for img in imgs], axis=1 + ) + + imgs = np.ascontiguousarray(imgs) + imgs = np.expand_dims(imgs, axis=0) + imgs = np.expand_dims(imgs, axis=0) + inputs.append(imgs) + frames.append(queue[-1]) + + return inputs, frames + + def postprocess(self, outputs, frame, filename, save_img=True): + """ + outputs: list + frames: list + """ + labels = [ + "Basketball", "BasketballDunk", "Biking", "CliffDiving", "CricketBowling", + "Diving", "Fencing", "FloorGymnastics", "GolfSwing", "HorseRiding", + "IceDancing", "LongJump", "PoleVault", "RopeClimbing", "SalsaSpin", + "SkateBoarding", "Skiing", "Skijet", "SoccerJuggling", "Surfing", + "TennisSwing", "TrampolineJumping", "VolleyballSpiking", "WalkingWithDog"] + nms_thresh = 0.5 + font = cv2.FONT_HERSHEY_SIMPLEX + for out in outputs: + out = paddle.to_tensor(out) + preds = [] + all_boxes = get_region_boxes(out) + for i in range(out.shape[0]): + boxes = all_boxes[i] + boxes = nms(boxes, nms_thresh) + + for box in boxes: + x1 = round(float(box[0] - box[2] / 2.0) * 320.0) + y1 = round(float(box[1] - box[3] / 2.0) * 240.0) + x2 = round(float(box[0] + box[2] / 2.0) * 320.0) + y2 = round(float(box[1] + box[3] / 2.0) * 240.0) + + det_conf = float(box[4]) + for j in range((len(box) - 5) // 2): + cls_conf = float(box[5 + 2 * j].item()) + prob = det_conf * cls_conf + preds.append([[x1, y1, x2, y2], prob, labels[int(box[6])]]) + + for _, dets in enumerate(preds): + if dets[1] < 0.4: + break + text = dets[2] + ' ' + '{:.2f}'.format(dets[1]) + cv2.rectangle(frame, (dets[0][0], dets[0][1]), (dets[0][2], dets[0][3]), (0, 255, 0), 2) + cv2.putText(frame, text, (dets[0][0] + 3, dets[0][1] - 5 - 10 * _), font, 0.5, (0, 255, 0), 2) + cv2.imwrite('{}.jpg'.format(filename), frame) \ No newline at end of file diff --git a/Bank_second_part/detect_process/tools/wheel.py b/Bank_second_part/detect_process/tools/wheel.py new file mode 100644 index 0000000..77281be --- /dev/null +++ b/Bank_second_part/detect_process/tools/wheel.py @@ -0,0 +1,354 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
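# A minimal usage sketch for the wheel interface defined in this module (the import path
# and video path are assumptions; model names come from the MODELS table below):
#
#     from tools.wheel import PaddleVideo
#     clas = PaddleVideo(model_name='ppTSM_v2', use_gpu=False, top_k=5)
#     clas.predict('data/example.avi')
#
# If model_file is not given, the matching inference model is downloaded once to
# ~/.paddlevideo_inference/inference_model/<model_name>/ and reused afterwards.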
+ +import os +import sys + +__dir__ = os.path.dirname(__file__) +sys.path.append(os.path.join(__dir__, '')) + +import numpy as np +import tarfile +import requests +from tqdm import tqdm +import shutil + +from paddle import inference +from paddle.inference import Config, create_predictor + +from tools.utils import ppTSM_Inference_helper + +__all__ = ['PaddleVideo'] + +# path of download model and data +BASE_DIR = os.path.expanduser("~/.paddlevideo_inference/") +BASE_INFERENCE_MODEL_DIR = os.path.join(BASE_DIR, 'inference_model') +BASE_VIDEOS_DIR = os.path.join(BASE_DIR, 'videos') + +# support Models +MODELS = { + 'ppTSM': + 'https://videotag.bj.bcebos.com/PaddleVideo/InferenceModel/ppTSM_infer.tar', + 'ppTSM_v2': + 'https://videotag.bj.bcebos.com/PaddleVideo/InferenceModel/ppTSM_v2_infer.tar' +} + +MODEL_NAMES = list(MODELS.keys()) + + +def parse_args(mMain=True, add_help=True): + """ + Args: + mMain: bool. True for command args, False for python interface + """ + import argparse + + def str2bool(v): + return v.lower() in ("true", "t", "1") + + if mMain == True: + + # general params + parser = argparse.ArgumentParser(add_help=add_help) + parser.add_argument("--model_name", type=str, default='') + parser.add_argument("-v", "--video_file", type=str, default='') + parser.add_argument("--use_gpu", type=str2bool, default=True) + + # params for decode and sample + parser.add_argument("--num_seg", type=int, default=16) + + # params for preprocess + parser.add_argument("--short_size", type=int, default=256) + parser.add_argument("--target_size", type=int, default=224) + + # params for predict + parser.add_argument("--model_file", type=str, default='') + parser.add_argument("--params_file", type=str) + parser.add_argument("-b", "--batch_size", type=int, default=1) + parser.add_argument("--use_fp16", type=str2bool, default=False) + parser.add_argument("--ir_optim", type=str2bool, default=True) + parser.add_argument("--use_tensorrt", type=str2bool, default=False) + parser.add_argument("--gpu_mem", type=int, default=8000) + parser.add_argument("--top_k", type=int, default=1) + parser.add_argument("--enable_mkldnn", type=bool, default=False) + parser.add_argument("--label_name_path", type=str, default='') + + return parser.parse_args() + + else: + return argparse.Namespace(model_name='', + video_file='', + use_gpu=True, + num_seg=16, + short_size=256, + target_size=224, + model_file='', + params_file='', + batch_size=1, + use_fp16=False, + ir_optim=True, + use_tensorrt=False, + gpu_mem=8000, + top_k=1, + enable_mkldnn=False, + label_name_path='') + + +def parse_file_paths(input_path: str) -> list: + if os.path.isfile(input_path): + files = [ + input_path, + ] + else: + files = os.listdir(input_path) + files = [ + file for file in files + if (file.endswith(".avi") or file.endswith(".mp4")) + ] + files = [os.path.join(input_path, file) for file in files] + return files + + +def download_with_progressbar(url, save_path): + response = requests.get(url, stream=True) + total_size_in_bytes = int(response.headers.get('content-length', 0)) + block_size = 1024 # 1 Kibibyte + progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) + with open(save_path, 'wb') as file: + for data in response.iter_content(block_size): + progress_bar.update(len(data)) + file.write(data) + progress_bar.close() + if total_size_in_bytes == 0 or progress_bar.n != total_size_in_bytes: + raise Exception("Something went wrong while downloading models") + + +def download_inference_model(model_storage_directory, url): + # 
using custom model + tar_file_name_list = [ + 'inference.pdiparams', 'inference.pdiparams.info', 'inference.pdmodel' + ] + if not os.path.exists( + os.path.join(model_storage_directory, + 'inference.pdiparams')) or not os.path.exists( + os.path.join(model_storage_directory, + 'inference.pdmodel')): + tmp_path = os.path.join(model_storage_directory, url.split('/')[-1]) + print('download {} to {}'.format(url, tmp_path)) + os.makedirs(model_storage_directory, exist_ok=True) + download_with_progressbar(url, tmp_path) #download + + #save to directory + with tarfile.open(tmp_path, 'r') as tarObj: + for member in tarObj.getmembers(): + filename = None + for tar_file_name in tar_file_name_list: + if tar_file_name in member.name: + filename = tar_file_name + if filename is None: + continue + file = tarObj.extractfile(member) + with open(os.path.join(model_storage_directory, filename), + 'wb') as f: + f.write(file.read()) + os.remove(tmp_path) + + +def create_paddle_predictor(args): + config = Config(args.model_file, args.params_file) + + if args.use_gpu: + config.enable_use_gpu(args.gpu_mem, 0) + else: + config.disable_gpu() + if args.enable_mkldnn: + # cache 10 different shapes for mkldnn to avoid memory leak + config.set_mkldnn_cache_capacity(10) + config.enable_mkldnn() + + config.disable_glog_info() + config.switch_ir_optim(args.ir_optim) # default true + if args.use_tensorrt: + config.enable_tensorrt_engine( + precision_mode=Config.Precision.Half + if args.use_fp16 else Config.Precision.Float32, + max_batch_size=args.batch_size) + + config.enable_memory_optim() + # use zero copy + config.switch_use_feed_fetch_ops(False) + predictor = create_predictor(config) + + return predictor + + +def load_label_name_dict(path): + result = {} + if not os.path.exists(path): + print( + 'Warning: If want to use your own label_dict, please input legal path!\nOtherwise label_names will be empty!' + ) + else: + for line in open(path, 'r'): + partition = line.split('\n')[0].partition(' ') + try: + result[int(partition[0])] = str(partition[-1]) + except: + result = {} + break + return result + + +class PaddleVideo(object): + def __init__(self, **kwargs): + print( + '\nInference models that Paddle provides are listed as follows:\n{}' + .format(MODEL_NAMES), '\n') + process_params = parse_args(mMain=False, add_help=False) + process_params.__dict__.update(**kwargs) + + if not os.path.exists(process_params.model_file): + if process_params.model_name is None: + raise Exception('Please input model name that you want to use!') + if process_params.model_name in MODEL_NAMES: + url = MODELS[process_params.model_name] + download_path = os.path.join(BASE_INFERENCE_MODEL_DIR, + process_params.model_name) + if not os.path.exists(download_path): + os.makedirs(download_path) + + #create pretrained model download_path + download_inference_model(model_storage_directory=download_path, + url=url) + + process_params.model_file = os.path.join( + download_path, 'inference.pdmodel') + process_params.params_file = os.path.join( + download_path, 'inference.pdiparams') + process_params.label_name_path = os.path.join( + __dir__, '../data/k400/Kinetics-400_label_list.txt') + else: + raise Exception( + 'If you want to use your own model, Please input model_file as model path!' 
+ ) + else: + print('Using user-specified model and params!') + print("process params are as follows: \n{}".format(process_params)) + self.label_name_dict = load_label_name_dict( + process_params.label_name_path) + + self.args = process_params + self.predictor = create_paddle_predictor(process_params) + + def predict(self, video): + """ + predict label of video with paddlevideo + Args: + video:input video for clas, support single video , internet url, folder path containing series of videos + Returns: + list[dict:{videoname: "",class_ids: [], scores: [], label_names: []}],if label name path is None,label names will be empty + """ + video_list = [] + assert isinstance(video, (str)) + + # get input_tensor and output_tensor + input_names = self.predictor.get_input_names() + output_names = self.predictor.get_output_names() + input_tensor_list = [] + output_tensor_list = [] + for item in input_names: + input_tensor_list.append(self.predictor.get_input_handle(item)) + for item in output_names: + output_tensor_list.append(self.predictor.get_output_handle(item)) + + if isinstance(video, str): + # download internet video + if video.startswith('http'): + if not os.path.exists(BASE_VIDEOS_DIR): + os.makedirs(BASE_VIDEOS_DIR) + video_path = os.path.join(BASE_VIDEOS_DIR, 'tmp.mp4') + download_with_progressbar(video, video_path) + print("Current using video from Internet:{}, renamed as: {}". + format(video, video_path)) + video = video_path + files = parse_file_paths(video) + else: + print('Please input legal video!') + + # Inferencing process + InferenceHelper = ppTSM_Inference_helper( + num_seg=self.args.num_seg, + short_size=self.args.short_size, + target_size=self.args.target_size, + top_k=self.args.top_k) + batch_num = self.args.batch_size + for st_idx in range(0, len(files), batch_num): + ed_idx = min(st_idx + batch_num, len(files)) + + # Pre process batched input + batched_inputs = InferenceHelper.preprocess_batch( + files[st_idx:ed_idx]) + + # run inference + for i in range(len(input_tensor_list)): + input_tensor_list[i].copy_from_cpu(batched_inputs[i]) + self.predictor.run() + + batched_outputs = [] + for j in range(len(output_tensor_list)): + batched_outputs.append(output_tensor_list[j].copy_to_cpu()) + + results_list = InferenceHelper.postprocess(batched_outputs, + print_output=False, + return_result=True) + + for res in results_list: + classes = res["topk_class"] + label_names = [] + if len(self.label_name_dict) != 0: + label_names = [self.label_name_dict[c] for c in classes] + res["label_names"] = label_names + + print("Current video file: {0}".format(res["video_id"])) + print("\ttop-{0} classes: {1}".format(len(res["topk_class"]), + res["topk_class"])) + print("\ttop-{0} scores: {1}".format(len(res["topk_scores"]), + res["topk_scores"])) + print("\ttop-{0} label names: {1}".format( + len(res["label_names"]), res["label_names"])) + + +def main(): + # for cmd + args = parse_args(mMain=True) + clas_engine = PaddleVideo(**(args.__dict__)) + clas_engine.predict(args.video_file) + + +if __name__ == '__main__': + main() diff --git a/Bank_second_part/detect_process/tools_function.py b/Bank_second_part/detect_process/tools_function.py new file mode 100644 index 0000000..14eb9a3 --- /dev/null +++ b/Bank_second_part/detect_process/tools_function.py @@ -0,0 +1,353 @@ +import cv2 +import os + + + +# 图像文件夹 +def get_video_list(path): + video_ext = [".mp4", ".avi",".MP4"] + video_names = [] + for maindir, subdir, file_name_list in os.walk(path): + for filename in file_name_list: + apath = 
os.path.join(maindir, filename) + ext = os.path.splitext(apath)[1] + if ext in video_ext: + video_names.append(apath) + return video_names + + +# # 截取裁剪需要的视频帧 +# def save_seg_video(video_name, frameToStart, frametoStop, videoWriter, bbox): +# cap = video_name +# cap.set(cv2.CAP_PROP_POS_FRAMES, frameToStart) # 设置初始帧数 +# count = frameToStart + +# while True: +# success, frame = cap.read() + +# if not success or count > frametoStop: +# break + +# if count >= frameToStart: +# # 裁剪视频画面 +# frame_target = frame[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])] +# videoWriter.write(frame_target) + +# count += 1 + +# cap.release() + + # 截取裁剪需要的视频帧 +def save_seg_video(video_name,frameToStart,frametoStop,videoWriter,bbox): + + cap = cv2.VideoCapture(video_name) + count = 0 + while True: + success,frame = cap.read() + if success: + count += 1 + if count <= frametoStop and count > frameToStart: # 选取起始帧 + # print('correct= ', count) + + #裁剪视频画面 + frame_target = frame[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])] # (split_height, split_width) + frame_target = cv2.resize(frame_target,(200,200)) + + videoWriter.write(frame_target) + + if not success or count >= frametoStop: + break + + videoWriter.release() + cap.release() + + + + +# 获得字典中所有values值(这个值是列表) +def get_dict_values(lst): + """ + 获取列表中所有字典的 values 值(如果值是列表) + + 参数: + lst: 包含字典的列表 + + 返回值: + values: 包含所有字典的 values 值的列表(如果值是列表) + """ + return [value for dictionary in lst for value in dictionary.values() if isinstance(value, list)] + + + # 解析检测后的结果,为检测后的结果排序 +def analysis_sort_list(re_list): + + # print('result_dict:',result_dict) + + # 获得检测列表 + # re_list = result_dict['start_bbox'] + # print('re_list:',re_list) + + # 获得列表中所有字典的values值 + # re_bbox_list = Process_tools.get_dict_values(re_list) + + # 为检测出来的标注框排序 + sorted_lst = sorted(re_list, key=lambda x: x[0]) + + return sorted_lst + + + #对比重叠率高的两个部分,并结合标注框,保存最大的标注框 +def contrast_bbox(e_bbox,r_bbox): + + e_bbox_min = e_bbox[:2] + r_bbox_min = r_bbox[:2] + + bbox_min = [min(x, y) for x, y in zip(e_bbox_min, r_bbox_min)] + + e_bbox_max = e_bbox[-2:] + r_bbox_max = r_bbox[-2:] + + bbox_max = [max(x, y) for x, y in zip(e_bbox_max, r_bbox_max)] + + bbox = bbox_min + bbox_max + + return bbox + + + # 解析result_list列表 +def analysis_re01_list(example_list,result_list): + + ''' + example_dict:对比的参照 + result_dict: 需要与参照对比的结果 + + example_sorted_lst:返回值中,原先有现在没有部分 + re_dict_sorted_lst:返回值中,现在有原先没有部分 + + cut_list:原先有,现在也有的部分 + + ''' + # 第一次检测到目标的帧率和信息 + # example_dict_fps = list(example_dict.keys())[0] + # example_sorted_lst = Process_tools.analysis_sort_list(example_list) + + # 当前帧检测结果中所有的检测结果数值 + # re_dict_fps = list(result_dict.keys())[0] + # re_dict_sorted_lst = Process_tools.analysis_sort_list(result_list) + + # 保存前后帧率连续的范围、筛选出相同的部分 + cut_list = [] + example_temp = [] + re_temp = [] + + for i,ex_bbox_dict in enumerate(example_list): + + ex_bbox = ex_bbox_dict['result'] + + for j,re_bbox in enumerate(result_list): + + iou = calculate_iou(box1=ex_bbox, box2=re_bbox) + + # print(iou) + + if iou > 0.5: + + # bbox = Process_tools.contrast_bbox(e_bbox=ex_bbox,r_bbox=re_bbox) + + # cut_list.append({i:re_bbox}) + cut_list.append(re_bbox) + example_temp.append(ex_bbox) + re_temp.append(re_bbox) + + break + + # print('example_temp:',example_temp) + # print('re_temp:',re_temp) + example_sorted_lst = [item for item in example_list if item['result'] not in example_temp] + re_dict_sorted_lst = [item for item in result_list if item not in re_temp] + + return cut_list,example_sorted_lst,re_dict_sorted_lst + + 
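# Worked example for calculate_iou below (box values hypothetical):
# box1 = [100, 100, 200, 200] and box2 = [150, 150, 250, 250] give an intersection of
# 51 * 51 = 2601 (the +1 keeps pixel coordinates inclusive), areas of 101 * 101 = 10201 each,
# and a union of 17801, so IoU = 2601 / 17801, roughly 0.146. That is below the 0.5
# threshold used in analysis_re01_list, so these two boxes would not be matched as the same target.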
+# 计算前后帧率重叠范围 +def calculate_iou(box1, box2): + """ + 计算两个边界框之间的IoU值 + + 参数: + box1: 边界框1的坐标(x1, y1, x2, y2) + box2: 边界框2的坐标(x1, y1, x2, y2) + + 返回值: + iou: 两个边界框之间的IoU值 + """ + x1 = max(box1[0], box2[0]) + y1 = max(box1[1], box2[1]) + x2 = min(box1[2], box2[2]) + y2 = min(box1[3], box2[3]) + + # 计算交集区域面积 + intersection_area = max(0, x2 - x1 + 1) * max(0, y2 - y1 + 1) + + # 计算边界框1和边界框2的面积 + box1_area = (box1[2] - box1[0] + 1) * (box1[3] - box1[1] + 1) + box2_area = (box2[2] - box2[0] + 1) * (box2[3] - box2[1] + 1) + + # 计算并集区域面积 + union_area = box1_area + box2_area - intersection_area + + # 计算IoU值 + iou = intersection_area / union_area + + return iou + +# 修正坐标参数 +def para_correction(images_size,bbox,dertpara): + + ''' + 修正检测后标注框过小的情况,如果有修正参数则使用修正参数,如果没有就按照坐标值扩大两倍 + + ''' + + # if dertpara: + # pass + # else: + w = (bbox[2] - bbox[0]) / int(dertpara) + h = (bbox[3] - bbox[1]) / int(dertpara) + + bbox_extand_list_x = [bbox[0] - w,bbox[2] + w] + bbox_extand_list_y = [bbox[1] - h,bbox[3] + h] + + bbox_list_x = contrast(size=images_size[0],bbox_extand_list=bbox_extand_list_x) + bbox_list_y = contrast(size=images_size[1],bbox_extand_list=bbox_extand_list_y) + + bbox_list = [bbox_list_x[0],bbox_list_y[0],bbox_list_x[1],bbox_list_y[1]] + + return bbox_list + + +def para_list_correction(images_size,bbox_list,dertpara): + + updata_result_list = [] + + for bbox in bbox_list: + + updata_bbox = para_correction(images_size,bbox,dertpara) + + updata_result_list.append(updata_bbox) + + return updata_result_list + +# 对比数值是否在这个范围内 +def contrast(size,bbox_extand_list): + + ''' + 对比数值是否在这个范围内 + ''' + + # print('bbox_extand_list:',bbox_extand_list) + # print('size:',size) + bbox_list = [] + + for x in bbox_extand_list: + + # print('size:',size) + + if 0 <= int(x) <= int(size): + # print('in:',x,size) + bbox_list.append(x) + if int(x) > int(size): + # print('>:',x,size) + bbox_list.append(size) + if int(x) < 0: + # print('<:',x,size) + bbox_list.append(0) + + # print('bbox_list:',bbox_list) + + return bbox_list + + +def change_list_dict(fps1,re_list): + + ''' + 给列表的结果设置对应帧率 + ''' + + bbox_list_all = [] + + for bbox_list in re_list: + + bbox_dict = {'fps':fps1,'result':bbox_list} + bbox_list_all.append(bbox_dict) + + return bbox_list_all + + +def statistics_fps(fps_now,re_list,parameter): + + ''' + 统计时长,返回时间满足截取要求的目标坐标 + ''' + + time_out_list = [] + + for bbox_dict in re_list: + + con_fps = int(fps_now) - int(bbox_dict["fps"]) + + if con_fps > parameter: + + time_out_list.append(bbox_dict) + + return time_out_list + + +def change_dict_list(dict_list): + ''' + 从字典列表得到bbox列表 + ''' + + bbox_list = [] + + for dicts1 in dict_list: + + bbox_list.append(dicts1['result']) + + return bbox_list + + +def select_list(result_list): + + ''' + 筛选列表中的空列表 + ''' + if result_list: + result_only = [] + + for result in result_list: + + if result == None : + pass + else: + + # result_bbox = select_bbox(result) + result_only.append(result) + + return result_only + +def select_bbox(bbox_list): + + # bbox_list_return = [] + + # print('bbox:',bbox_list) + left_top = [min(bbox_list, key=lambda p: p[0])[0], min(bbox_list, key=lambda p: p[1])[1]] + right_bottom = [max(bbox_list, key=lambda p: p[0])[0], max(bbox_list, key=lambda p: p[1])[1]] + + bbox_list_return = left_top + right_bottom + + + # print('bbox_list:',bbox_list_return) + + return bbox_list_return + \ No newline at end of file diff --git a/Bank_second_part/detect_process/video_process.py b/Bank_second_part/detect_process/video_process.py index c928a68..5c6ced6 100644 --- 
diff --git a/Bank_second_part/detect_process/video_process.py b/Bank_second_part/detect_process/video_process.py
index c928a68..5c6ced6 100644
--- a/Bank_second_part/detect_process/video_process.py
+++ b/Bank_second_part/detect_process/video_process.py
@@ -1,4 +1,3 @@
-import numpy as np
 import cv2
 import os
 import time
@@ -11,8 +10,11 @@ import threading
 from config import Q_SZ
 from personDet import analysis_yolov8
-import tools
+import tools_function
 from holisticDet import MediapipeProcess
+import mediapipe_detection_image
+from PP_TSMv2_infer import PP_TSMv2_predict
+import shutil
@@ -31,24 +33,25 @@ class DealVideo():
         self.person_model = person_model
         self.mediapipe_model = mediapipe_model
-        self.pptsmv2_model = pptsmv2_model
+        self.predictor = pptsmv2_model[1]
+        self.infer = pptsmv2_model[0]
+        self.batch_size = 1
 
-        # 图片检测后队列
+        # Work queues
         self.videoQueue = queue.Queue(maxsize=Q_SZ)
-        self.frameQueue = queue.Queue(maxsize=0)
+        self.videoQueue2 = queue.Queue(maxsize=Q_SZ)
         self.cutbboxQueue = queue.Queue(maxsize=0)
-        self.videoframeQueue = queue.Queue(maxsize=0)
-        self.videohandsQueue = queue.Queue(maxsize=0)
-        self.videoheadQueue = queue.Queue(maxsize=0)
-        self.videopersonQueue = queue.Queue(maxsize=0)
+        self.videodetQueue = queue.Queue(maxsize=0)
+        self.videoQueue3 = queue.Queue(maxsize=0)
 
         #线程
         self.get_video_listThread = threading.Thread(target=self.get_video_list)
         self.get_video_frameThread = threading.Thread(target=self.get_video_frame)
-        self.person_detThread = threading.Thread(target=self.person_det)
         self.write_videoThread = threading.Thread(target=self.write_video)
-        self.select_video_frameThread = threading.Thread(target=self.select_video_frame)
         self.head_hands_detThread = threading.Thread(target=self.head_hands_det)
+        self.video_select_dectThread = threading.Thread(target=self.video_select_dect)
+        self.select_video_pathThread = threading.Thread(target=self.select_video_path)
+
 
     def get_video_list(self):
@@ -70,173 +73,268 @@ class DealVideo():
         else:
             self.videoQueue.put(self.video_file)
 
-    # def cut_video_seg(self):
-
-    #     pass
 
     def get_video_frame(self):
         '''
         对视频进行分帧、每一帧都保存队列
         '''
         while True:
-            if ~self.videoQueue.empty():
-
-                try:
-                    video_path = self.videoQueue.get()
-
-                    # video_basename = os.path.basename(video_path).split('.')[0]
+            if self.videoQueue.empty():
-                    cap = cv2.VideoCapture(video_path)
+                time.sleep(1)
+
+            else:
+
+                t1 = time.time()
+                video_path = self.videoQueue.get()
+
+                # video_basename = os.path.basename(video_path).split('.')[0]
-                    frame_list = []
-                    count_fps = 0
+                print('video_path:', video_path)
-                    while cap.isOpened():
-                        success, frame = cap.read()
-                        if not success:
-                            print(video_path,"Ignoring empty camera frame.")
-                            break
-                        count_fps += 1
-                        # print('count_fps_read_video=',count_fps)
+                cap = cv2.VideoCapture(video_path)
+                video_fps = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-                        frame_dict = {'fps':count_fps,'frame':frame}
-                        frame_list.append(frame_dict)
-
+                # frame_list = []
+                count_fps = 0
+                frame_result_contact = []
+                count_fps_del = 0
-                    video_dict = {'video_path':video_path,'frame_list':frame_list}
-
-                    self.frameQueue.put(video_dict)
-                    # time.sleep(30)
+                while cap.isOpened():
+                    success, frame = cap.read()
+                    if not success:
+                        print(video_path, "Ignoring empty camera frame.")
+                        print('video_fps:', video_fps, 'count_fps:', count_fps)
+                        break
+
+                    # print('count_fps_read_video=',count_fps)
+                    imgsize = frame.shape
-                except Exception as e:
-                    print(e)
+                    # Person detection on the current frame
+                    person_det = analysis_yolov8(frame=frame,
+                                                 model_coco=self.person_model,
+                                                 confidence_set=0.5)
+                    person_list = tools_function.get_dict_values(person_det)
-    def person_det(self):
+                    if frame_result_contact:
+                        start_fps = frame_result_contact[0]['fps']
+                    else:
+                        start_fps = count_fps
-        '''
-        从队列中获取视频帧frame,进行第一步人员的检测
-        '''
+                    # Is this the last frame of the video?
+                    if count_fps == (video_fps - 1):
-        while True:
+                        video_end = True
-            if ~self.videoframeQueue.empty():
+                    else:
-                video_frame_dict = self.videoframeQueue.get()
+                        video_end = False
-                frame_list = video_frame_dict['frame_list']
-                video_path = video_frame_dict['video_path']
+                    if person_list:
-                frame_result_contact = []
+                        count_fps_del_re,updata_result_contact = self.analysis_by_bbox(imgsize=imgsize,
+                                                                                       detect_result=person_list,
+                                                                                       dertpara=10,
+                                                                                       start_fps=start_fps,
+                                                                                       now_fps=count_fps,
+                                                                                       label_name='person',
+                                                                                       video_path=video_path,
+                                                                                       frame_result_contact=frame_result_contact,
+                                                                                       parameter_fps=200,
+                                                                                       count_fps_del=count_fps_del,
+                                                                                       video_end=video_end
+                                                                                       )
+                        count_fps_del = count_fps_del_re
+                        frame_result_contact = updata_result_contact
-                for i in range(len(frame_list)):
+                    count_fps += 1
-                    if frame_list[i]["fps"] == i + 1:
-                        person_det = analysis_yolov8(frame=frame_list[i]['frame'],
-                                                    model_coco=self.person_model,
-                                                    confidence_set=0.5)
-
-                        # 当前帧检测的结果列表,只包含bboxlist
-                        person_list = tools.get_dict_values(person_det)
-                        label_name = list(person_det[0].keys())[0]
+    def head_hands_det(self):
-                        update_frame_result_contact = self.get_cut_message(fps1=frame_list[i]["fps"],
-                                                                           label_name = label_name,
-                                                                           re_list=person_list,
-                                                                           video_path=video_path,
-                                                                           frame_result_contact=frame_result_contact)
-
-                        frame_result_contact = update_frame_result_contact
-                        # print('frame_result_contact:',frame_result_contact)
+        while True:
-    def head_hands_det(self):
+            if self.videoQueue3.empty():
-        while True:
+                time.sleep(1)
+            else:
-            if ~self.videopersonQueue.empty():
+                t0 = time.time()
+                video_path = self.videoQueue3.get()
-                person_frame_dict = self.videopersonQueue.get()
+                print('video_path_head_hands_det:', video_path)
-                person_frame_list = person_frame_dict['frame_list']
-                video_path = person_frame_dict['video_path']
+                cap = cv2.VideoCapture(video_path)
+                video_fps = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+                # frame_list = []
+                count_fps = 0
                 head_result_contact = []
                 hands_result_contact = []
+                count_fps_del_head = 0
+                count_fps_del_hand = 0
-                for i in range(len(person_frame_list)):
+                while cap.isOpened():
+                    success, frame = cap.read()
+                    if not success:
+                        print(video_path, "Ignoring empty camera frame.")
+                        print('count_fps:', count_fps, 'video_fps:', video_fps)
+                        break
-                    if person_frame_list[i]["fps"] == i + 1:
+                    # print('count_fps_read_video=',count_fps)
+                    imgsize = frame.shape
-                        image = person_frame_list[i]["frame"]
+                    # MediaPipe holistic inference
+                    hh_result = MediapipeProcess.mediapipe_det(image=frame,
+                                                               holistic=self.mediapipe_model)
+                    hh_result_dict = MediapipeProcess.get_analysis_result(image=frame,results=hh_result)
-                        imgsize = image.shape
+                    # Current head and hand boxes (None entries filtered out)
+                    head_result = hh_result_dict['face_bbox']
+                    head_result_1 = tools_function.select_list(head_result)
+                    hands_result = hh_result_dict['hand_bbox']
+                    hands_result_1 = tools_function.select_list(hands_result)
-                        # print(type(image))
+
+                    if count_fps == (video_fps - 1):
+
+                        print('count_fps:', count_fps, 'video_fps:', video_fps)
+
+                        video_end = True
+                    else:
+
+                        video_end = False
+
+                    # Pad/clamp the boxes, then analyse heads and hands separately
+                    if head_result_1:
+
+                        if head_result_contact:
+                            start_fps = head_result_contact[0]['fps']
+                        else:
+                            start_fps = count_fps
+                        count_fps_del_re,updata_result_contact = self.analysis_by_bbox(imgsize=imgsize,
+                                                                                       detect_result=head_result_1,
+                                                                                       dertpara=1,
+                                                                                       start_fps=start_fps,
+                                                                                       now_fps=count_fps,
+                                                                                       label_name='head',
+                                                                                       video_path=video_path,
+                                                                                       frame_result_contact=head_result_contact,
+                                                                                       parameter_fps=50,
+                                                                                       count_fps_del=count_fps_del_head,
+                                                                                       video_end=video_end
+                                                                                       )
+                        count_fps_del_head = count_fps_del_re
+                        head_result_contact = updata_result_contact
+
+                    if hands_result_1:
+
+                        if hands_result_contact:
+                            start_fps = hands_result_contact[0]['fps']
+                        else:
+                            start_fps = count_fps
+
+                        count_fps_del_re,updata_result_contact = self.analysis_by_bbox(imgsize=imgsize,
+                                                                                       detect_result=hands_result_1,
+                                                                                       dertpara=2,
+                                                                                       start_fps=start_fps,
+                                                                                       now_fps=count_fps,
+                                                                                       label_name='hands',
+                                                                                       video_path=video_path,
+                                                                                       frame_result_contact=hands_result_contact,
+                                                                                       parameter_fps=50,
+                                                                                       count_fps_del=count_fps_del_hand,
+                                                                                       video_end=video_end
+                                                                                       )
+                        count_fps_del_hand = count_fps_del_re
+                        hands_result_contact = updata_result_contact
+
+                    count_fps += 1
+
+
+    def video_select_dect(self):
-                        # 模型推理
-                        hh_result = MediapipeProcess.mediapipe_det(image=image,
-                                                                   holistic=self.mediapipe_model)
-                        hh_result_dict = MediapipeProcess.get_analysis_result(image=image,results=hh_result)
+        while True:
+            if self.videodetQueue.empty():
+                time.sleep(5)
+            else:
-
-                        # 获得当前坐标列表
-                        head_result = hh_result_dict['face_bbox']
-                        head_result_1 = tools.select_list(head_result)
-                        hands_result = hh_result_dict['hand_bbox']
-                        hands_result_1 = tools.select_list(hands_result)
-
-                        print('head_result_1:',head_result_1)
-                        print('head_result_1:',hands_result_1)
-
-                        # 统一修正坐标,分别对头和手进行分析
-                        if head_result_1:
-                            head_bbox_list = tools.para_list_correction(images_size=imgsize,
-                                                                        bbox_list=head_result_1,
-                                                                        dertpara=[])
-
-
-                            update_head_result_contact = self.get_cut_message(fps1=person_frame_list[i]["fps"],
-                                                                              label_name = 'head',
-                                                                              re_list=head_bbox_list,
-                                                                              video_path=video_path,
-                                                                              frame_result_contact=head_result_contact)
-                            head_result_contact = update_head_result_contact
-
-
-                        if hands_result_1:
-
-                            hands_bbox_list = tools.para_list_correction(images_size=imgsize,
-                                                                         bbox_list=hands_result_1,
-                                                                         dertpara=[])
-
-                            update_hands_result_contact = self.get_cut_message(fps1=person_frame_list[i]["fps"],
-                                                                               label_name = 'hands',
-                                                                               re_list=hands_bbox_list,
-                                                                               video_path=video_path,
-                                                                               frame_result_contact=hands_result_contact)
-
-                            hands_result_contact = update_hands_result_contact
+                video_path = self.videodetQueue.get()
-                        # print("head_result_contact:",head_result_contact)
-                        # print("hands_result_contact:",hands_result_contact)
+                try:
+                    # Behaviour classification of the cut clip with PP-TSMv2
+                    result_list = PP_TSMv2_predict().predict(input_f=video_path,
+                                                             batch_size=self.batch_size,
+                                                             predictor=self.predictor,
+                                                             InferenceHelper=self.infer)
+
+                    video_base_name = os.path.basename(video_path)
+                    video_save_select_path = self.video_save_file + '/' + 'video_select_dect/' + str(result_list['topk_class'])
+                    os.makedirs(video_save_select_path, exist_ok=True)
+                    video_save = os.path.join(video_save_select_path, video_base_name)
-    def get_cut_message(self,fps1,label_name,re_list,video_path,frame_result_contact):
+                    # Move the clip into the directory of its predicted class
+                    os.rename(video_path, video_save)
+
+                    print("result_list_video_select_dect:", result_list)
+
+                except Exception as e:
+                    print(e)
+
+    def analysis_by_bbox(self,imgsize,detect_result,dertpara,start_fps,now_fps,label_name,video_path,frame_result_contact,parameter_fps,count_fps_del,video_end):
+        '''
+        Pad/clamp the detected boxes, then update the tracking ("contact") list.
+
+        imgsize: image size (frame.shape)
+        detect_result: detections of the current frame, as a list of bboxes
+        dertpara: box-padding divisor (integer, e.g. 2 or 3); each side is padded by width/dertpara resp. height/dertpara
+        start_fps: first frame of the tracked target in the contact list
+        now_fps: current frame index
+        label_name: detection class being analysed
+        video_path: path of the source video
+        frame_result_contact: contact list of the targets currently tracked
+        parameter_fps: frame-count threshold after which a clip is cut
+        count_fps_del: number of consecutive frames a target has gone undetected
+        '''
+
+        bbox_list = tools_function.para_list_correction(images_size=imgsize,
+                                                        bbox_list=detect_result,
+                                                        dertpara=dertpara)
+
+        count_fps_del_re,update_frame_result_contact = self.get_cut_message(fps1=now_fps,
+                                                                            label_name = label_name,
+                                                                            re_list=bbox_list,
+                                                                            video_path=video_path,
+                                                                            frame_result_contact=frame_result_contact,
+                                                                            parameter_fps=parameter_fps,
+                                                                            count_fps_del=count_fps_del,
+                                                                            video_end=video_end)
+
+        # count_fps_del_re,updata_result_contact = self.get_continue_keys(count_fps_del=count_fps_del,
+        #                                                                 continue_para=continue_para,
+        #                                                                 start_fps=start_fps,
+        #                                                                 now_fps=now_fps,
+        #                                                                 frame_result_contact=frame_result_contact,
+        #                                                                 update_frame_result_contact=update_frame_result_contact)\
+
+        return count_fps_del_re,update_frame_result_contact
+
+
+    def get_cut_message(self,fps1,label_name,re_list,video_path,frame_result_contact,parameter_fps,count_fps_del,video_end):
+
+        # continue_para = False
         if not frame_result_contact:
-            bbox_list_all = tools.change_list_dict(fps1=fps1,re_list=re_list)
+            bbox_list_all = tools_function.change_list_dict(fps1=fps1,re_list=re_list)
             frame_result_contact = bbox_list_all
             # print("frame_result_contact:",frame_result_contact)
@@ -244,41 +342,45 @@ class DealVideo():
         else:
             example_dict_list = frame_result_contact
-            print('example_dict_list:',example_dict_list)
-            print('re_list:',re_list)
-            cut_list,example_lst,re_dict_lst = tools.analysis_re01_list(example_list=example_dict_list,
+            cut_list,example_lst,re_dict_lst = tools_function.analysis_re01_list(example_list=example_dict_list,
                                                                                  result_list=re_list)
-
-            # print('cut_list:',cut_list)
-            # print('example_sorted_lst:',example_lst)
-            # print('re_dict_sorted_lst:',re_dict_lst)
-
-            # 有目标减少情况
             if example_lst:
-
+                # A tracked target disappeared: prepare a clip for saving
+                # continue_para = True
                 cut_dict = {'video_path':video_path,'label_name':label_name,"stop_fps":fps1,'bbox_list':example_lst}
+
+                start_fps = example_lst[0]['fps']
-                # 添加到新的队列
-                self.cutbboxQueue.put(cut_dict)
+                # Tolerate a few missed detections before cutting
+                if count_fps_del <= 3:
+
+                    frame_result_contact = frame_result_contact
+                    count_fps_del = count_fps_del + 1
+
+                else:
-                frame_result_contact = [item for item in frame_result_contact if item not in example_lst]
+                    # Ignore segments shorter than 10 frames
+                    if (fps1 - start_fps) < 10:
+
+                        frame_result_contact = frame_result_contact
+                    else:
+
+                        frame_result_contact = [item for item in frame_result_contact if item not in example_lst]
+                        self.cutbboxQueue.put(cut_dict)
-            # 有新添加目标情况
+            # Newly appeared targets
            if re_dict_lst:
                 # 对比示例列表更新
-                update_list = tools.change_list_dict(fps1=fps1,re_list=re_dict_lst)
+                update_list = tools_function.change_list_dict(fps1=fps1,re_list=re_dict_lst)
                 frame_result_contact = frame_result_contact + update_list
             # 统计截止时间
-            time_out_list = tools.statistics_fps(fps_now=fps1,re_list=frame_result_contact,parameter=20)
+            time_out_list = tools_function.statistics_fps(fps_now=fps1,re_list=frame_result_contact,parameter=parameter_fps)
-
             if time_out_list:
 
                 # 裁剪保存视频
@@ -291,143 +393,183 @@ class DealVideo():
 
                 # 对比示例列表更新
                 frame_result_contact = [item for item in frame_result_contact if item not in time_out_list]
+
+            if video_end:
+
+                # End of the video: flush everything still being tracked
+                cut_dict = {'video_path':video_path,'label_name':label_name,"stop_fps":fps1,'bbox_list':frame_result_contact}
+
+                self.cutbboxQueue.put(cut_dict)
+
+                frame_result_contact.clear()
             # print('frame_result_contact:',frame_result_contact)
-
-        return frame_result_contact
+
+        return count_fps_del,frame_result_contact
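The cut-off in get_cut_message relies on tools_function.statistics_fps: a target is flushed once the current frame is more than parameter_fps frames past the frame at which it was first recorded (200 for 'person', 50 for 'head'/'hands'). A minimal sketch with made-up numbers, not part of the patch:

    contact = [{'fps': 10, 'result': [0, 0, 100, 100]}]          # target first seen at frame 10
    timed_out = statistics_fps(fps_now=215, re_list=contact, parameter=200)
    # 215 - 10 = 205 > 200, so the entry is returned and a clip will be cut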
+
+    # def get_continue_keys(self,count_fps_del,continue_para,start_fps,now_fps,frame_result_contact,update_frame_result_contact):
+
+    #     # Check whether the target was only momentarily undetected
+    #     if continue_para:
+
+    #         dert_fps = now_fps - start_fps
+
+    #         print('dert_fps:',dert_fps)
+
+    #         if dert_fps <= 20:
+
+    #             count_fps_del = count_fps_del + 1
+
+    #             if count_fps_del <= 3:
+
+    #                 frame_result_contact = frame_result_contact
+
+    #             else:
+
+    #                 frame_result_contact = update_frame_result_contact
+    #                 count_fps_del = 0
+
+    #         else:
+    #             count_fps_del = 0
+
+    #     else:
+
+    #         frame_result_contact = update_frame_result_contact
+
+    #     return count_fps_del,frame_result_contact
+
     def write_video(self):
         '''
         保存成视频
         '''
         while True:
-
-            if ~self.cutbboxQueue.empty():
-
+            if self.cutbboxQueue.empty():
+                time.sleep(2)
+            else:
                 video_frame_dict = self.cutbboxQueue.get()
-
-                # print('video_frame_dict:',video_frame_dict)
-
-                # 视频路径
                 video_path = video_frame_dict['video_path']
                 video_basename = os.path.basename(video_path).split('.')[0]
                 file_name = video_frame_dict['label_name']
-                # video_name_save = os.path.join(self.video_save_file, video_basename)
-
-                # 原视频帧率和尺寸
                 cap = cv2.VideoCapture(video_path)
                 fps = cap.get(cv2.CAP_PROP_FPS)
+                video_fps = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-                # 获得起始帧
+                print(video_path, 'fps:', fps, 'video_fps:', video_fps)
+                # Stop frame for this cut
                 stop_fps = video_frame_dict['stop_fps']
-                # 裁剪信息
                 result_list = video_frame_dict['bbox_list']
+                if cap.isOpened():
-                for i,bbox_dict in enumerate(result_list):
-
-                    start_fps = bbox_dict['fps']
-                    bbox_list = bbox_dict['result']
-
-                    w = int(bbox_list[2]) - int(bbox_list[0])
-                    h = int(bbox_list[3]) - int(bbox_list[1])
+                    for i,bbox_dict in enumerate(result_list):
+                        start_fps = bbox_dict['fps']
-                    size = (w,h)
+                        if start_fps >= stop_fps:
-                    # 根据标签保存不同视频分类
-                    video_name_save = video_basename + '_' + str(start_fps) + '_' + str(stop_fps) + '_' + str(i) + '.avi'
-                    video_save_file = self.video_save_file + '/' + file_name
-                    os.makedirs(video_save_file, exist_ok=True)
-                    video_save_path = os.path.join(video_save_file, video_name_save)
-
-                    videoWriter =cv2.VideoWriter(video_save_path,cv2.VideoWriter_fourcc('X','V','I','D'),fps,size)
+                            print('start_fps:', start_fps, 'stop_fps:', stop_fps)
+                            break
-                    tools.save_seg_video(video_name=video_path,
-                                         frameToStart=start_fps,
-                                         frametoStop=stop_fps,
-                                         videoWriter=videoWriter,
-                                         bbox=bbox_list)
-
-                    videoWriter.release()
+                        else:
+                            bbox_list = bbox_dict['result']
+                            # w = int(bbox_list[2]) - int(bbox_list[0])
+                            # h = int(bbox_list[3]) - int(bbox_list[1])
+                            size = (200,200)
+                            # Save clips into one directory per label
+                            video_name_save = video_basename + '_' + str(start_fps) + '_' + str(stop_fps) + '_' + str(i) + '.avi'
+                            video_save_file = self.video_save_file + '/' + file_name
+                            os.makedirs(video_save_file, exist_ok=True)
+                            video_save_path = os.path.join(video_save_file, video_name_save)
+                            videoWriter = cv2.VideoWriter(video_save_path,cv2.VideoWriter_fourcc('X','V','I','D'),fps,size)
+
+                            tools_function.save_seg_video(video_name=video_path,
+                                                          frameToStart=start_fps,
+                                                          frametoStop=stop_fps,
+                                                          videoWriter=videoWriter,
+                                                          bbox=bbox_list)
+                            videoWriter.release()
+                            self.videoQueue2.put(video_save_path)
+
+                    cap.release()
+
+                else:
+                    print(video_path)
+                    break
-                self.videoQueue.put(video_save_path)
-                cap.release()
 
-    def select_video_frame(self):
+    def select_video_path(self):
         while True:
-            if ~self.frameQueue.empty():
-
-                video_dict = self.frameQueue.get()
-                video_path = video_dict["video_path"]
+            if self.videoQueue2.empty():
+                time.sleep(5)
+            else:
+                video_path = self.videoQueue2.get()
                 directory = os.path.dirname(video_path)
                 labels = directory.split('/')[-1]
-                print('labels:',labels)
-
-                if labels == 'person':
+                print('select_video_path:', video_path)
-                    self.videopersonQueue.put(video_dict)
+                # print(labels)
-                if labels == 'head':
-
-                    # print('youshou')
+                if labels == 'person':
-                    self.videoheadQueue.put(video_dict)
-
-                if labels == 'hands':
+                    # Person clips go back through head/hand detection
+                    self.videoQueue3.put(video_path)
-                    # print('youshou')
+                if labels == 'head' or labels == 'hands':
-                    self.videohandsQueue.put(video_dict)
+                    # Head/hand clips go to PP-TSMv2 behaviour classification
+                    self.videodetQueue.put(video_path)
                 else:
-
-                    self.videoframeQueue.put(video_dict)
-
+                    pass
 
     def run(self):
         self.get_video_listThread.start()
         self.get_video_frameThread.start()
-        self.person_detThread.start()
         self.write_videoThread.start()
-        self.select_video_frameThread.start()
+        # self.write_videoThread.join()
         self.head_hands_detThread.start()
+        self.video_select_dectThread.start()
+        self.select_video_pathThread.start()
 
 
 if __name__ == '__main__':
-
-
-
-    video = "E:/Bank_files/Bank_02/dataset/video_test/test03_3.avi"
+    t1 = time.time()
+    video = "E:/Bank_files/Bank_02/dataset/video_test/1min/0711-7_4.avi"
     video_save = 'test_video'
-    person_model = YOLO("model_file/yolov8x_person.pt")
+    # Initialise the person detector
+    person_model = YOLO("model_file/yolov8n.pt")
+
+    # Initialise the PP-TSMv2 inference model
+    config = 'model_file/inference/pptsm_lcnet_k400_16frames_uniform.yaml'  # config file path
+    model_file = 'model_file/inference/ppTSMv2.pdmodel'  # inference model path
+    params_file = 'model_file/inference/ppTSMv2.pdiparams'
+    # batch_size= 1
+    infer,predictor = PP_TSMv2_predict().create_inference_model(config,model_file,params_file)
+    # PP_TSMv2_predict().predict(config,input_file,batch_size,predictor,infer)
+
     # 初始化mediapipe
     mp_holistic = mp.solutions.holistic
     holistic = mp_holistic.Holistic(
         min_detection_confidence=0.5,
         min_tracking_confidence=0.5)
 
     # get_seg_video(video_file=video,video_save_path=video_save,dertTime=dertTime)
-    deal = DealVideo(video_file=video,video_save_file=video_save,person_model=person_model,mediapipe_model=holistic,pptsmv2_model='model_file/yolov8x_person.pt')
-    deal.run()
-
-
+    deal = DealVideo(video_file=video,video_save_file=video_save,person_model=person_model,mediapipe_model=holistic,pptsmv2_model=[infer,predictor])
+    deal.run()
+    t2 = time.time()
-
-
-
+    # print('total time:', t2 - t1)
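The same predictor pair can also be exercised on a single clip outside the threaded pipeline, which is handy when checking the classifier in isolation. A minimal sketch, assuming infer and predictor were built as in the __main__ block above; the clip path is a hypothetical write_video output:

    clip = 'test_video/person/0711-7_4_0_120_0.avi'   # hypothetical clip produced by write_video
    result = PP_TSMv2_predict().predict(input_f=clip,
                                        batch_size=1,
                                        predictor=predictor,
                                        InferenceHelper=infer)
    print(result['topk_class'])   # predicted class index used for routing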