0808: update project code

V0.1.0
王莹 2 years ago
parent 65cca19d91
commit 530524ff53

@ -0,0 +1,164 @@
import os
import os.path as osp
from paddlevideo.utils.config import get_config
from paddle.inference import Config, create_predictor
from tools.utils import build_inference_helper
class PP_TSMv2_predict(object):
    """Common parameter initialization for the PP-TSMv2 model."""
    def __init__(self, use_gpu=True, ir_optim=True,
                 disable_glog=False, save_name=None, enable_mkldnn=False,
                 precision="fp32", gpu_mem=8000, cpu_threads=None):
        self.use_gpu = use_gpu              # whether to run on GPU
        self.cpu_threads = cpu_threads      # number of CPU threads
        self.ir_optim = ir_optim            # whether to enable IR optimization
        self.disable_glog = disable_glog    # whether to silence glog output
        self.gpu_mem = gpu_mem              # initial GPU memory pool size (MB)
        self.enable_mkldnn = enable_mkldnn  # whether to enable MKL-DNN
        self.precision = precision          # MKL-DNN precision ("fp32"/"fp16")
        self.save_name = save_name          # name of the exported inference model
    def parse_file_paths(self, input_path: str) -> list:
        """
        Collect the input files for the model.
        input_path: a single video file, or a directory of .avi/.mp4 videos.
        """
        if osp.isfile(input_path):
            files = [input_path]
        else:
            files = os.listdir(input_path)
            files = [
                file for file in files
                if (file.endswith(".avi") or file.endswith(".mp4"))
            ]
            files = [osp.join(input_path, file) for file in files]
        return files
    def create_paddle_predictor(self, model_f, pretr_p, cfg):
        """
        Create the inference engine.
        model_f: path to the exported inference model file.
        pretr_p: path to the trained parameter file.
        cfg: model config.
        """
        config = Config(model_f, pretr_p)
        if self.use_gpu:
            config.enable_use_gpu(self.gpu_mem, 0)
        else:
            config.disable_gpu()
            if self.cpu_threads:
                config.set_cpu_math_library_num_threads(self.cpu_threads)
            if self.enable_mkldnn:
                # cache 10 different input shapes to avoid memory leaks
                config.set_mkldnn_cache_capacity(10)
                config.enable_mkldnn()
                if self.precision == "fp16":
                    config.enable_mkldnn_bfloat16()
        config.switch_ir_optim(self.ir_optim)
        config.enable_memory_optim()
        # use zero-copy tensors instead of feed/fetch ops
        config.switch_use_feed_fetch_ops(False)
        if self.disable_glog:
            config.disable_glog_info()
        predictor = create_predictor(config)
        return config, predictor
    def create_inference_model(self, config, model_f, params_f):
        """
        Create the inference helper and engine.
        config: path to the model config file.
        model_f: path to the exported inference model.
        params_f: path to the inference model parameters.
        """
cfg = get_config(config, overrides=None, show=False)
InferenceHelper = build_inference_helper(cfg.INFERENCE)
_, predictor = self.create_paddle_predictor(model_f, params_f, cfg)
return InferenceHelper,predictor
    def predict(self, input_f, batch_size, predictor, InferenceHelper):
        """
        Run inference over the input data.
        input_f: path to the data to be predicted (file or directory).
        batch_size: number of samples per inference batch, default 1.
        predictor: the inference engine.
        InferenceHelper: the inference helper (pre/post-processing).
        """
        result = {}
        # get input and output tensors
input_names = predictor.get_input_names()
output_names = predictor.get_output_names()
input_tensor_list = []
output_tensor_list = []
for item in input_names:
input_tensor_list.append(predictor.get_input_handle(item))
for item in output_names:
output_tensor_list.append(predictor.get_output_handle(item))
        files = self.parse_file_paths(input_f)
        batch_num = batch_size
        for st_idx in range(0, len(files), batch_num):
            ed_idx = min(st_idx + batch_num, len(files))
            # preprocess the input batch
            batched_inputs = InferenceHelper.preprocess_batch(files[st_idx:ed_idx])
            for i in range(len(input_tensor_list)):
                input_tensor_list[i].copy_from_cpu(batched_inputs[i])
            # run the inference engine
            predictor.run()
            batched_outputs = []
            for j in range(len(output_tensor_list)):
                batched_outputs.append(output_tensor_list[j].copy_to_cpu())
            # postprocess the inference outputs; note that `result` keeps only
            # the result of the last processed batch
            res = InferenceHelper.postprocess(batched_outputs, False, True)
            result["video_id"] = res[0]["video_id"]
            result["topk_class"] = res[0]["topk_class"].tolist()[0]
            result["topk_scores"] = res[0]["topk_scores"].tolist()[0]
        return result
# def main():
#     config = 'D:/download/PaddleVideo1/output/output/pptsm_lcnet_k400_16frames_uniform.yaml'  # config file path
#     input_file = 'C:/Users/Administrator/Pictures/video_seg_re_hand/test01_3.avi'  # data to be predicted
#     model_file = 'D:/download/PaddleVideo1/output/output/ppTSMv2.pdmodel'  # inference model path
#     params_file = 'D:/download/PaddleVideo1/output/output/ppTSMv2.pdiparams'  # inference params path
#     batch_size = 1
#     infer, predictor = PP_TSMv2_predict().create_inference_model(config, model_file, params_file)
#     # note: predict() takes (input_f, batch_size, predictor, InferenceHelper)
#     PP_TSMv2_predict().predict(input_file, batch_size, predictor, infer)
# if __name__ == "__main__":
#     main()
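# --- Hedged usage sketch (added for illustration; not in the original commit).
# predict() as written keeps only the result of the last processed batch, so a
# sketch that wants one result per video calls it file-by-file. All paths and
# the helper name `predict_directory` are hypothetical.
def predict_directory(config_path, model_path, params_path, video_dir):
    helper = PP_TSMv2_predict(use_gpu=False)
    infer, predictor = helper.create_inference_model(config_path, model_path,
                                                     params_path)
    results = []
    for video in helper.parse_file_paths(video_dir):
        # one dict per video: {'video_id', 'topk_class', 'topk_scores'}
        results.append(helper.predict(video, 1, predictor, infer))
    return results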

@ -0,0 +1,152 @@
# Copyright 2020 The MediaPipe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MediaPipe solution drawing utils."""
import math
from typing import List, Mapping, Optional, Tuple, Union
import cv2
import dataclasses
import matplotlib.pyplot as plt
import numpy as np
from mediapipe.framework.formats import detection_pb2
from mediapipe.framework.formats import location_data_pb2
from mediapipe.framework.formats import landmark_pb2
_PRESENCE_THRESHOLD = 0.5
_VISIBILITY_THRESHOLD = 0.5
_BGR_CHANNELS = 3
WHITE_COLOR = (224, 224, 224)
BLACK_COLOR = (0, 0, 0)
RED_COLOR = (0, 0, 255)
GREEN_COLOR = (0, 128, 0)
BLUE_COLOR = (255, 0, 0)
@dataclasses.dataclass
class DrawingSpec:
# Color for drawing the annotation. Default to the white color.
color: Tuple[int, int, int] = WHITE_COLOR
# Thickness for drawing the annotation. Default to 2 pixels.
thickness: int = 2
# Circle radius. Default to 2 pixels.
circle_radius: int = 2
def _normalized_to_pixel_coordinates(
normalized_x: float, normalized_y: float, image_width: int,
image_height: int) -> Union[None, Tuple[int, int]]:
"""Converts normalized value pair to pixel coordinates."""
# Checks if the float value is between 0 and 1.
def is_valid_normalized_value(value: float) -> bool:
return (value > 0 or math.isclose(0, value)) and (value < 1 or
math.isclose(1, value))
if not (is_valid_normalized_value(normalized_x) and
is_valid_normalized_value(normalized_y)):
# TODO: Draw coordinates even if it's outside of the image bounds.
return None
x_px = min(math.floor(normalized_x * image_width), image_width - 1)
y_px = min(math.floor(normalized_y * image_height), image_height - 1)
return x_px, y_px
def draw_landmarks(
image: np.ndarray,
landmark_list: landmark_pb2.NormalizedLandmarkList,
connections: Optional[List[Tuple[int, int]]] = None):
"""Draws the landmarks and the connections on the image.
Args:
image: A three channel BGR image represented as numpy ndarray.
landmark_list: A normalized landmark list proto message to be annotated on
the image.
connections: A list of landmark index tuples that specifies how landmarks to
be connected in the drawing.
landmark_drawing_spec: Either a DrawingSpec object or a mapping from hand
landmarks to the DrawingSpecs that specifies the landmarks' drawing
settings such as color, line thickness, and circle radius. If this
argument is explicitly set to None, no landmarks will be drawn.
connection_drawing_spec: Either a DrawingSpec object or a mapping from hand
connections to the DrawingSpecs that specifies the connections' drawing
settings such as color and line thickness. If this argument is explicitly
set to None, no landmark connections will be drawn.
Raises:
ValueError: If one of the followings:
a) If the input image is not three channel BGR.
b) If any connetions contain invalid landmark index.
"""
if not landmark_list:
return
if image.shape[2] != _BGR_CHANNELS:
raise ValueError('Input image must contain three channel bgr data.')
image_rows, image_cols, _ = image.shape
  # Map every visible landmark index to its pixel coordinates.
  idx_to_coordinates = {}
  for idx, landmark in enumerate(landmark_list.landmark):
    if ((landmark.HasField('visibility') and
         landmark.visibility < _VISIBILITY_THRESHOLD) or
        (landmark.HasField('presence') and
         landmark.presence < _PRESENCE_THRESHOLD)):
      continue
    landmark_px = _normalized_to_pixel_coordinates(landmark.x, landmark.y,
                                                   image_cols, image_rows)
    if landmark_px:
      idx_to_coordinates[idx] = landmark_px
  if connections:
    # Keep the end landmark of each connection and return the pixel
    # coordinates of those endpoints that are visible.
    end_list = [connection[1] for connection in connections]
    point_axis_list = []
    for point in end_list:
      if point in idx_to_coordinates:
        point_axis_list.append(idx_to_coordinates[point])
    return point_axis_list
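# --- Hedged usage sketch (added for illustration; not in the original file).
# draw_landmarks above returns pixel coordinates rather than drawing. A
# minimal call with MediaPipe Holistic might look like this; the image path
# is a hypothetical placeholder.
if __name__ == '__main__':
  import mediapipe as mp
  holistic = mp.solutions.holistic.Holistic(static_image_mode=True)
  image = cv2.imread('frame.jpg')  # hypothetical input image
  results = holistic.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
  points = draw_landmarks(image, results.right_hand_landmarks,
                          mp.solutions.holistic.HAND_CONNECTIONS)
  print(points)  # list of (x_px, y_px) tuples, or None if nothing detected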

@ -0,0 +1,104 @@
import cv2
import mediapipe as mp
import analysisPoint as mp_drawing
mp_holistic = mp.solutions.holistic
import numpy as np
class MediapipeProcess:

    @staticmethod
    def mediapipe_det(image, holistic):
        '''
        Run the holistic model on one frame and return the detection results.
        '''
        image.flags.writeable = False
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        results = holistic.process(image)
        return results
    @staticmethod
    def get_analysis_result(image, results):
        '''
        image: the frame that was detected.
        results: the detection results for that frame.
        Turn the results into face / hand bounding boxes.
        '''
        image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
face_result = mp_drawing.draw_landmarks(
image,
results.face_landmarks,
mp_holistic.FACEMESH_CONTOURS)
right_hand_result = mp_drawing.draw_landmarks(
image,
results.right_hand_landmarks,
mp_holistic.HAND_CONNECTIONS)
left_hand_result = mp_drawing.draw_landmarks(
image,
results.left_hand_landmarks,
mp_holistic.HAND_CONNECTIONS)
face_bbox = MediapipeProcess.point_to_bbox(face_result)
right_hand_bbox = MediapipeProcess.point_to_bbox(right_hand_result)
left_hand_bbox = MediapipeProcess.point_to_bbox(left_hand_result)
result_dict = {'face_bbox':[face_bbox],'hand_bbox':[right_hand_bbox,left_hand_bbox]}
return result_dict
    @staticmethod
    def point_to_bbox(result_list):
        '''
        Compute the axis-aligned bounding box of a list of keypoints.
        Returns [x_min, y_min, x_max, y_max], or None if no points were given.
        '''
        if not result_list:  # guards against None / empty keypoint lists
            return None
        # minAreaRect expects float32/int32 points and returns the rotated
        # rectangle as ((cx, cy), (w, h), angle); boxPoints yields its 4 corners
        result_array = np.array(result_list, dtype=np.float32)
        rect = cv2.minAreaRect(result_array)
        bbox = cv2.boxPoints(rect)
        bbox = bbox.astype(np.int32).tolist()
        left_top = [min(bbox, key=lambda p: p[0])[0], min(bbox, key=lambda p: p[1])[1]]
        right_bottom = [max(bbox, key=lambda p: p[0])[0], max(bbox, key=lambda p: p[1])[1]]
        bbox_list = left_top + right_bottom
        return bbox_list
# if __name__ == '__main__':
#     media_holistic(video_file='E:/Bank_files/Bank_02/dataset/video_person/after_1/0711-1_199_0.avi',
#                    video_save_path='E:/Bank_files/Bank_02/videos_mediapipe/test_data/0725_test')
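# --- Hedged usage sketch (added for illustration; not in the original file).
# Detect one frame and print the face / hand bounding boxes; the video path
# is a hypothetical placeholder.
if __name__ == '__main__':
    cap = cv2.VideoCapture('test.avi')
    ok, frame = cap.read()
    cap.release()
    with mp_holistic.Holistic(min_detection_confidence=0.5,
                              min_tracking_confidence=0.5) as holistic:
        if ok:
            results = MediapipeProcess.mediapipe_det(frame, holistic)
            bboxes = MediapipeProcess.get_analysis_result(frame, results)
            print(bboxes)  # {'face_bbox': [...], 'hand_bbox': [..., ...]}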

@ -0,0 +1,15 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .version import paddlevideo_version

@ -0,0 +1,22 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .builder import build_dataset, build_dataloader, build_batch_pipeline
from .dataset import VideoDataset
from .dali_loader import TSN_Dali_loader, get_input_data
__all__ = [
'build_dataset', 'build_dataloader', 'build_batch_pipeline', 'VideoDataset',
'TSN_Dali_loader', 'get_input_data'
]

@ -0,0 +1,132 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import signal
import os
import paddle
from paddle.io import DataLoader, DistributedBatchSampler
from .registry import DATASETS, PIPELINES
from ..utils.build_utils import build
from .pipelines.compose import Compose
from paddlevideo.utils import get_logger
from paddlevideo.utils.multigrid import DistributedShortSampler
import numpy as np
logger = get_logger("paddlevideo")
def build_pipeline(cfg):
    """Build pipeline.
    Args:
        cfg (dict): root config dict.
    """
    if cfg is None:
        return
    return Compose(cfg)
def build_dataset(cfg):
"""Build dataset.
Args:
cfg (dict): root config dict.
Returns:
dataset: dataset.
"""
#XXX: ugly code here!
cfg_dataset, cfg_pipeline = cfg
cfg_dataset.pipeline = build_pipeline(cfg_pipeline)
dataset = build(cfg_dataset, DATASETS, key="format")
return dataset
def build_batch_pipeline(cfg):
batch_pipeline = build(cfg, PIPELINES)
return batch_pipeline
def build_dataloader(dataset,
batch_size,
num_workers,
places,
shuffle=True,
drop_last=True,
multigrid=False,
collate_fn_cfg=None,
**kwargs):
"""Build Paddle Dataloader.
XXX explain how the dataloader work!
Args:
dataset (paddle.dataset): A PaddlePaddle dataset object.
batch_size (int): batch size on single card.
num_worker (int): num_worker
shuffle(bool): whether to shuffle the data at every epoch.
"""
if multigrid:
sampler = DistributedShortSampler(dataset,
batch_sizes=batch_size,
shuffle=True,
drop_last=True)
else:
sampler = DistributedBatchSampler(dataset,
batch_size=batch_size,
shuffle=shuffle,
drop_last=drop_last)
    #NOTE(shipping): when a mix operator such as mixup or cutmix is switched on,
    # a batch like [[img, label, attribute, ...], [img, label, attribute, ...], ...] is recollated to
    # [[img, img, ...], [label, label, ...], [attribute, attribute, ...], ...], like a numpy transpose.
def mix_collate_fn(batch):
pipeline = build_batch_pipeline(collate_fn_cfg)
batch = pipeline(batch)
slots = []
for items in batch:
for i, item in enumerate(items):
if len(slots) < len(items):
slots.append([item])
else:
slots[i].append(item)
return [np.stack(slot, axis=0) for slot in slots]
#if collate_fn_cfg is not None:
#ugly code here. collate_fn is mix op config
# collate_fn = mix_collate_fn(collate_fn_cfg)
data_loader = DataLoader(
dataset,
batch_sampler=sampler,
places=places,
num_workers=num_workers,
collate_fn=mix_collate_fn if collate_fn_cfg is not None else None,
return_list=True,
**kwargs)
return data_loader
def term_mp(sig_num, frame):
""" kill all child processes
"""
pid = os.getpid()
pgid = os.getpgid(os.getpid())
logger.info("main proc {} exit, kill process group " "{}".format(pid, pgid))
os.killpg(pgid, signal.SIGKILL)
return
signal.signal(signal.SIGINT, term_mp)
signal.signal(signal.SIGTERM, term_mp)
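# --- Hedged illustration (added; not in the original file). The mix-style
# recollation inside build_dataloader regroups a batch of per-sample tuples
# into per-field arrays, i.e. a transpose over the batch dimension. The same
# slot logic as a standalone sketch:
def _recollate(batch):
    """[[img0, lbl0], [img1, lbl1]] -> [stack(img0, img1), stack(lbl0, lbl1)]"""
    slots = []
    for items in batch:
        for i, item in enumerate(items):
            if len(slots) < len(items):
                slots.append([item])
            else:
                slots[i].append(item)
    return [np.stack(slot, axis=0) for slot in slots]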

@ -0,0 +1,206 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import math
import paddle
from paddle.distributed import ParallelEnv
import paddle.distributed as dist
from paddlevideo.utils import get_logger
logger = get_logger("paddlevideo")
try:
    from nvidia.dali.pipeline import Pipeline
    import nvidia.dali.ops as ops
    import nvidia.dali.types as types
    import tempfile
    from nvidia.dali.plugin.paddle import DALIGenericIterator
except ImportError:
    # allow importing this module without DALI installed
    Pipeline = object
def get_input_data(data):
return paddle.to_tensor(data[0]['image']), paddle.to_tensor(
data[0]['label'])
class TSN_Dali_loader(object):
def __init__(self, cfg):
self.batch_size = cfg.batch_size
self.file_path = cfg.file_path
self.num_seg = cfg.num_seg
self.seglen = cfg.seglen
self.short_size = cfg.short_size
self.target_size = cfg.target_size
# set num_shards and shard_id when distributed training is implemented
self.num_shards = dist.get_world_size()
self.shard_id = ParallelEnv().local_rank
self.dali_mean = cfg.mean * (self.num_seg * self.seglen)
self.dali_std = cfg.std * (self.num_seg * self.seglen)
def build_dali_reader(self):
"""
build dali training reader
"""
def reader_():
with open(self.file_path) as flist:
full_lines = [line for line in flist]
if (not hasattr(reader_, 'seed')):
reader_.seed = 0
random.Random(reader_.seed).shuffle(full_lines)
logger.info(f"reader shuffle seed: {reader_.seed}.")
if reader_.seed is not None:
reader_.seed += 1
per_node_lines = int(
math.ceil(len(full_lines) * 1.0 / self.num_shards))
total_lines = per_node_lines * self.num_shards
                # pad full_lines so that it divides evenly across the shards
full_lines += full_lines[:(total_lines - len(full_lines))]
assert len(full_lines) == total_lines
# trainer get own sample
lines = full_lines[self.shard_id:total_lines:self.num_shards]
assert len(lines) == per_node_lines
logger.info(
f"shard_id: {self.shard_id}, trainer_count: {self.num_shards}"
)
logger.info(
f"read videos from {self.shard_id * per_node_lines}, "
f"length: {per_node_lines}, "
f"lines length: {len(lines)}, "
f"total: {len(full_lines)}")
video_files = ''.join([item for item in lines])
tf = tempfile.NamedTemporaryFile()
tf.write(str.encode(video_files))
tf.flush()
video_files = tf.name
device_id = ParallelEnv().local_rank
logger.info(f'---------- device_id: {device_id} -----------')
pipe = VideoPipe(batch_size=self.batch_size,
num_threads=1,
device_id=device_id,
file_list=video_files,
sequence_length=self.num_seg * self.seglen,
num_seg=self.num_seg,
seg_length=self.seglen,
resize_shorter_scale=self.short_size,
crop_target_size=self.target_size,
is_training=True,
num_shards=self.num_shards,
shard_id=self.shard_id,
dali_mean=self.dali_mean,
dali_std=self.dali_std)
            logger.info(
                'initializing dataset, this may take several minutes if the dataset is large ....'
            )
video_loader = DALIGenericIterator([pipe], ['image', 'label'],
len(lines),
dynamic_shape=True,
auto_reset=True)
return video_loader
dali_reader = reader_()
return dali_reader
class VideoPipe(Pipeline):
def __init__(self,
batch_size,
num_threads,
device_id,
file_list,
sequence_length,
num_seg,
seg_length,
resize_shorter_scale,
crop_target_size,
is_training=False,
initial_prefetch_size=20,
num_shards=1,
shard_id=0,
dali_mean=0.,
dali_std=1.0):
super(VideoPipe, self).__init__(batch_size, num_threads, device_id)
self.input = ops.VideoReader(device="gpu",
file_list=file_list,
sequence_length=sequence_length,
num_seg=num_seg,
seg_length=seg_length,
is_training=is_training,
num_shards=num_shards,
shard_id=shard_id,
random_shuffle=is_training,
initial_fill=initial_prefetch_size)
        # the sequence data read by ops.VideoReader is of shape [F, H, W, C].
        # Because ops.Resize does not support sequence data, it is transposed
        # to [H, W, F, C], reshaped to [H, W, F*C], and then resized like a
        # 2-D image.
self.transpose = ops.Transpose(device="gpu", perm=[1, 2, 0, 3])
self.reshape = ops.Reshape(device="gpu",
rel_shape=[1.0, 1.0, -1],
layout='HWC')
self.resize = ops.Resize(device="gpu",
resize_shorter=resize_shorter_scale)
        # crop and mirror are applied by ops.CropMirrorNormalize.
        # Normalization is left to Paddle because of the difficulty of
        # dimension broadcasting; it is unclear whether DALI broadcasts
        # correctly here, so the Paddle op is used instead.
self.pos_rng_x = ops.Uniform(range=(0.0, 1.0))
self.pos_rng_y = ops.Uniform(range=(0.0, 1.0))
self.mirror_generator = ops.Uniform(range=(0.0, 1.0))
self.cast_mirror = ops.Cast(dtype=types.DALIDataType.INT32)
self.crop_mirror_norm = ops.CropMirrorNormalize(
device="gpu",
crop=[crop_target_size, crop_target_size],
mean=dali_mean,
std=dali_std)
self.reshape_back = ops.Reshape(
device="gpu",
shape=[num_seg, seg_length * 3, crop_target_size, crop_target_size],
layout='FCHW')
self.cast_label = ops.Cast(device="gpu", dtype=types.DALIDataType.INT64)
def define_graph(self):
output, label = self.input(name="Reader")
output = self.transpose(output)
output = self.reshape(output)
output = self.resize(output)
output = output / 255.
pos_x = self.pos_rng_x()
pos_y = self.pos_rng_y()
mirror_flag = self.mirror_generator()
mirror_flag = (mirror_flag > 0.5)
mirror_flag = self.cast_mirror(mirror_flag)
output = self.crop_mirror_norm(output,
crop_pos_x=pos_x,
crop_pos_y=pos_y,
mirror=mirror_flag)
output = self.reshape_back(output)
label = self.cast_label(label)
return output, label
def __len__(self):
return self.epoch_size()
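# --- Hedged usage sketch (added; not in the original file). Building the
# reader requires an NVIDIA GPU and the nvidia-dali packages; the config
# values below are hypothetical placeholders.
if __name__ == '__main__':
    from types import SimpleNamespace
    cfg = SimpleNamespace(batch_size=4, file_path='train.list', num_seg=8,
                          seglen=1, short_size=256, target_size=224,
                          mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    loader = TSN_Dali_loader(cfg).build_dali_reader()
    for data in loader:
        imgs, labels = get_input_data(data)
        print(imgs.shape, labels.shape)
        break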

@ -0,0 +1,109 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import random
import numpy as np
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class MRIDataset(BaseDataset):
"""Rawframe dataset for action recognition.
The dataset loads raw frames from frame files, and apply specified transform operatation them.
The indecx file is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace.
Example of an index file:
.. code-block:: txt
file_path-1 150 1
file_path-2 160 1
file_path-3 170 2
file_path-4 180 2
Args:
file_path (str): Path to the index file.
pipeline(XXX):
data_prefix (str): directory path of the data. Default: None.
test_mode (bool): Whether to bulid the test dataset. Default: False.
suffix (str): suffix of file. Default: 'img_{:05}.jpg'.
"""
def __init__(self,
file_path,
pipeline,
num_retries=5,
data_prefix=None,
test_mode=False,
suffix='img_{:05}.jpg'):
self.num_retries = num_retries
self.suffix = suffix
super().__init__(file_path, pipeline, data_prefix, test_mode)
def load_file(self):
"""Load index file to get video information."""
info = []
with open(self.file_path, 'r') as fin:
for line in fin:
line_split = line.strip().split()
frame_dir, frames_len, labels = line_split
if self.data_prefix is not None:
frame_dir = osp.join(self.data_prefix, frame_dir)
info.append(
dict(
frame_dir=frame_dir,
#suffix=self.suffix,
frames_len=frames_len,
labels=int(labels)))
return info
def prepare_train(self, idx):
"""Prepare the frames for training/valid gisven index. """
#Try to catch Exception caused by reading missing frames files
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['frame_dir'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return np.array(results['imgs']), np.array([results['labels']])
def prepare_test(self, idx):
"""Prepare the frames for test given index. """
#Try to catch Exception caused by reading missing frames files
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['frame_dir'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return np.array(results['imgs']), np.array([results['labels']])
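# --- Hedged usage sketch (added; not in the original file). The index file
# has one "<frame_dir> <frames_len> <label>" line per video; the pipeline is
# whatever transform Compose the config builds, shown here as a pass-through
# placeholder, so only load_file/__len__ are exercised. The path is
# hypothetical.
if __name__ == '__main__':
    dataset = MRIDataset(file_path='train.list',
                         pipeline=lambda results: results)
    print(len(dataset), dataset.info[0]['frame_dir'])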

@ -0,0 +1,111 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import random
import numpy as np
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class SFMRIDataset(BaseDataset):
"""Rawframe dataset for action recognition.
The dataset loads raw frames from frame files, and apply specified transform operatation them.
The indecx file is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace.
Example of an index file:
.. code-block:: txt
file_path-1 150 1
file_path-2 160 1
file_path-3 170 2
file_path-4 180 2
Args:
file_path (str): Path to the index file.
pipeline(XXX):
data_prefix (str): directory path of the data. Default: None.
test_mode (bool): Whether to bulid the test dataset. Default: False.
suffix (str): suffix of file. Default: 'img_{:05}.jpg'.
"""
def __init__(self,
file_path,
pipeline,
num_retries=5,
data_prefix=None,
test_mode=False,
suffix='img_{:05}.jpg'):
self.num_retries = num_retries
self.suffix = suffix
super().__init__(file_path, pipeline, data_prefix, test_mode)
def load_file(self):
"""Load index file to get video information."""
info = []
with open(self.file_path, 'r') as fin:
for line in fin:
line_split = line.strip().split()
frame_dir, frames_len, labels = line_split
if self.data_prefix is not None:
frame_dir = osp.join(self.data_prefix, frame_dir)
info.append(
dict(
frame_dir=frame_dir,
#suffix=self.suffix,
frames_len=frames_len,
labels=int(labels)))
return info
def prepare_train(self, idx):
"""Prepare the frames for training/valid gisven index. """
#Try to catch Exception caused by reading missing frames files
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['frame_dir'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return np.array(results['imgs'][0]), np.array(
results['imgs'][1]), np.array([results['labels']])
def prepare_test(self, idx):
"""Prepare the frames for test given index. """
#Try to catch Exception caused by reading missing frames files
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['frame_dir'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return np.array(results['imgs'][0]), np.array(
results['imgs'][1]), np.array([results['labels']])

@ -0,0 +1,41 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .actbert_dataset import ActBertDataset
from .ava_dataset import AVADataset
from .bmn_dataset import BMNDataset
from .davis_dataset import DavisDataset
from .feature import FeatureDataset
from .frame import FrameDataset, FrameDataset_Sport
from .MRI import MRIDataset
from .MRI_SlowFast import SFMRIDataset
from .msrvtt import MSRVTTDataset
from .asrf_dataset import ASRFDataset
from .ms_tcn_dataset import MSTCNDataset
from .oxford import MonoDataset
from .skeleton import SkeletonDataset
from .slowfast_video import SFVideoDataset
from .video import VideoDataset
from .ucf101_skeleton import UCF101SkeletonDataset
from .ucf24_dataset import UCF24Dataset
__all__ = [
'VideoDataset', 'FrameDataset', 'SFVideoDataset', 'BMNDataset',
'FeatureDataset', 'SkeletonDataset', 'AVADataset', 'MonoDataset',
'MSRVTTDataset', 'ActBertDataset', 'DavisDataset', 'MRIDataset',
'SFMRIDataset', 'FrameDataset_Sport', 'MSTCNDataset', 'ASRFDataset',
'UCF101SkeletonDataset', 'UCF24Dataset'
]

@ -0,0 +1,74 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import random
import numpy as np
try:
import lmdb
except ImportError as e:
    print(
        f"Warning! {e}, the [lmdb] package and its dependencies are required for ActBERT."
    )
import pickle
import json
try:
from paddlenlp.transformers import BertTokenizer
except ImportError as e:
    print(
        f"Warning! {e}, the [paddlenlp] package and its dependencies are required for ActBERT."
    )
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class ActBertDataset(BaseDataset):
"""ActBert dataset.
"""
def __init__(
self,
file_path,
pipeline,
bert_model="bert-base-uncased",
data_prefix=None,
test_mode=False,
):
self.bert_model = bert_model
super().__init__(file_path, pipeline, data_prefix, test_mode)
def load_file(self):
"""Load index file to get video information."""
feature_data = np.load(self.file_path, allow_pickle=True)
self.tokenizer = BertTokenizer.from_pretrained(self.bert_model,
do_lower_case=True)
self.info = []
for item in feature_data:
self.info.append(dict(feature=item, tokenizer=self.tokenizer))
return self.info
def prepare_train(self, idx):
"""Prepare the frames for training/valid given index. """
results = copy.deepcopy(self.info[idx])
#print('==results==', results)
results = self.pipeline(results)
return results['features']
def prepare_test(self, idx):
"""Prepare the frames for test given index. """
pass

@ -0,0 +1,104 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import os
import numpy as np
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class ASRFDataset(BaseDataset):
"""Video dataset for action segmentation.
"""
def __init__(
self,
file_path,
pipeline,
feature_path,
label_path,
boundary_path,
**kwargs,
):
super().__init__(file_path, pipeline, **kwargs)
self.label_path = label_path
self.boundary_path = boundary_path
self.feature_path = feature_path
    def load_file(self):
        """Load index file to get video information."""
        with open(self.file_path, 'r') as file_ptr:
            info = file_ptr.read().split('\n')[:-1]
        return info
def prepare_train(self, idx):
"""TRAIN & VALID: Prepare data for training/valid given the index."""
results = {}
video_name = self.info[idx]
# load video feature
file_name = video_name.split('.')[0] + ".npy"
feat_file_path = os.path.join(self.feature_path, file_name)
#TODO: check path
video_feat = np.load(feat_file_path)
# load label
file_name = video_name.split('.')[0] + ".npy"
label_file_path = os.path.join(self.label_path, file_name)
label = np.load(label_file_path).astype(np.int64)
# load boundary
file_name = video_name.split('.')[0] + ".npy"
boundary_file_path = os.path.join(self.boundary_path, file_name)
boundary = np.expand_dims(np.load(boundary_file_path),axis=0).astype(np.float32)
results['video_feat'] = copy.deepcopy(video_feat)
results['video_label'] = copy.deepcopy(label)
results['video_boundary'] = copy.deepcopy(boundary)
results = self.pipeline(results)
return results['video_feat'], results['video_label'], results['video_boundary']
def prepare_test(self, idx):
"""TEST: Prepare the data for test given the index."""
results = {}
video_name = self.info[idx]
# load video feature
file_name = video_name.split('.')[0] + ".npy"
feat_file_path = os.path.join(self.feature_path, file_name)
#TODO: check path
video_feat = np.load(feat_file_path)
# load label
file_name = video_name.split('.')[0] + ".npy"
label_file_path = os.path.join(self.label_path, file_name)
label = np.load(label_file_path).astype(np.int64)
# load boundary
file_name = video_name.split('.')[0] + ".npy"
boundary_file_path = os.path.join(self.boundary_path, file_name)
boundary = np.expand_dims(np.load(boundary_file_path),axis=0).astype(np.float32)
results['video_feat'] = copy.deepcopy(video_feat)
results['video_label'] = copy.deepcopy(label)
results['video_boundary'] = copy.deepcopy(boundary)
results = self.pipeline(results)
return results['video_feat'], results['video_label'], results['video_boundary']

@ -0,0 +1,249 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import random
import numpy as np
import sys
import os
import pickle
from datetime import datetime
from ...metrics.ava_utils import ava_evaluate_results
from ..registry import DATASETS
from .base import BaseDataset
from collections import defaultdict
@DATASETS.register()
class AVADataset(BaseDataset):
"""AVA dataset for spatial temporal detection.
the dataset loads raw frames, bounding boxes, proposals and applies
transformations to return the frame tensors and other information.
"""
_FPS = 30
def __init__(self,
pipeline,
file_path=None,
exclude_file=None,
label_file=None,
suffix='{:05}.jpg',
proposal_file=None,
person_det_score_thr=0.9,
num_classes=81,
data_prefix=None,
test_mode=False,
num_max_proposals=1000,
timestamp_start=900,
timestamp_end=1800):
self.custom_classes = None
self.exclude_file = exclude_file
self.label_file = label_file
self.proposal_file = proposal_file
        assert 0 <= person_det_score_thr <= 1, (
            'The value of '
            'person_det_score_thr should be in [0, 1]. ')
self.person_det_score_thr = person_det_score_thr
self.num_classes = num_classes
self.suffix = suffix
self.num_max_proposals = num_max_proposals
self.timestamp_start = timestamp_start
self.timestamp_end = timestamp_end
super().__init__(
file_path,
pipeline,
data_prefix,
test_mode,
)
if self.proposal_file is not None:
self.proposals = self._load(self.proposal_file)
else:
self.proposals = None
if not test_mode:
valid_indexes = self.filter_exclude_file()
            self.info = [self.info[i] for i in valid_indexes]
    def _load(self, path):
        with open(path, 'rb') as f:
            res = pickle.load(f)
        return res
def parse_img_record(self, img_records):
bboxes, labels, entity_ids = [], [], []
while len(img_records) > 0:
img_record = img_records[0]
num_img_records = len(img_records)
selected_records = list(
filter(
lambda x: np.array_equal(x['entity_box'], img_record[
'entity_box']), img_records))
num_selected_records = len(selected_records)
img_records = list(
filter(
lambda x: not np.array_equal(x['entity_box'], img_record[
'entity_box']), img_records))
assert len(img_records) + num_selected_records == num_img_records
bboxes.append(img_record['entity_box'])
valid_labels = np.array([
selected_record['label'] for selected_record in selected_records
])
label = np.zeros(self.num_classes, dtype=np.float32)
label[valid_labels] = 1.
labels.append(label)
entity_ids.append(img_record['entity_id'])
bboxes = np.stack(bboxes)
labels = np.stack(labels)
entity_ids = np.stack(entity_ids)
return bboxes, labels, entity_ids
def filter_exclude_file(self):
valid_indexes = []
if self.exclude_file is None:
valid_indexes = list(range(len(self.info)))
else:
exclude_video_infos = [
x.strip().split(',') for x in open(self.exclude_file)
]
for i, video_info in enumerate(self.info):
valid_indexes.append(i)
for video_id, timestamp in exclude_video_infos:
if (video_info['video_id'] == video_id
and video_info['timestamp'] == int(timestamp)):
valid_indexes.pop()
break
return valid_indexes
def load_file(self):
"""Load index file to get video information."""
info = []
records_dict_by_img = defaultdict(list)
with open(self.file_path, 'r') as fin:
for line in fin:
line_split = line.strip().split(',')
video_id = line_split[0]
timestamp = int(line_split[1])
img_key = f'{video_id},{timestamp:04d}'
entity_box = np.array(list(map(float, line_split[2:6])))
label = int(line_split[6])
entity_id = int(line_split[7])
shot_info = (0, (self.timestamp_end - self.timestamp_start) *
self._FPS)
video_info = dict(video_id=video_id,
timestamp=timestamp,
entity_box=entity_box,
label=label,
entity_id=entity_id,
shot_info=shot_info)
records_dict_by_img[img_key].append(video_info)
for img_key in records_dict_by_img:
video_id, timestamp = img_key.split(',')
bboxes, labels, entity_ids = self.parse_img_record(
records_dict_by_img[img_key])
ann = dict(gt_bboxes=bboxes,
gt_labels=labels,
entity_ids=entity_ids)
frame_dir = video_id
if self.data_prefix is not None:
frame_dir = osp.join(self.data_prefix, frame_dir)
video_info = dict(frame_dir=frame_dir,
video_id=video_id,
timestamp=int(timestamp),
img_key=img_key,
shot_info=shot_info,
fps=self._FPS,
ann=ann)
info.append(video_info)
return info
def prepare_train(self, idx):
results = copy.deepcopy(self.info[idx])
img_key = results['img_key']
results['suffix'] = self.suffix
results['timestamp_start'] = self.timestamp_start
results['timestamp_end'] = self.timestamp_end
if self.proposals is not None:
if img_key not in self.proposals:
results['proposals'] = np.array([[0, 0, 1, 1]])
results['scores'] = np.array([1])
else:
proposals = self.proposals[img_key]
assert proposals.shape[-1] in [4, 5]
if proposals.shape[-1] == 5:
thr = min(self.person_det_score_thr, max(proposals[:, 4]))
positive_inds = (proposals[:, 4] >= thr)
proposals = proposals[positive_inds]
proposals = proposals[:self.num_max_proposals]
results['proposals'] = proposals[:, :4]
results['scores'] = proposals[:, 4]
else:
proposals = proposals[:self.num_max_proposals]
results['proposals'] = proposals
ann = results.pop('ann')
results['gt_bboxes'] = ann['gt_bboxes']
results['gt_labels'] = ann['gt_labels']
results['entity_ids'] = ann['entity_ids']
#ret = self.pipeline(results, "")
ret = self.pipeline(results)
#padding for dataloader
len_proposals = ret['proposals'].shape[0]
len_gt_bboxes = ret['gt_bboxes'].shape[0]
len_gt_labels = ret['gt_labels'].shape[0]
len_scores = ret['scores'].shape[0]
len_entity_ids = ret['entity_ids'].shape[0]
padding_len = 128
ret['proposals'] = self.my_padding_2d(ret['proposals'], padding_len)
ret['gt_bboxes'] = self.my_padding_2d(ret['gt_bboxes'], padding_len)
ret['gt_labels'] = self.my_padding_2d(ret['gt_labels'], padding_len)
ret['scores'] = self.my_padding_1d(ret['scores'], padding_len)
ret['entity_ids'] = self.my_padding_1d(ret['entity_ids'], padding_len)
return ret['imgs'][0], ret['imgs'][1], ret['proposals'], ret[
'gt_bboxes'], ret['gt_labels'], ret['scores'], ret[
'entity_ids'], np.array(
ret['img_shape'], dtype=int
), idx, len_proposals, len_gt_bboxes, len_gt_labels, len_scores, len_entity_ids
def my_padding_2d(self, feat, max_len):
feat_add = np.zeros((max_len - feat.shape[0], feat.shape[1]),
dtype=np.float32)
feat_pad = np.concatenate((feat, feat_add), axis=0)
return feat_pad
def my_padding_1d(self, feat, max_len):
feat_add = np.zeros((max_len - feat.shape[0]), dtype=np.float32)
feat_pad = np.concatenate((feat, feat_add), axis=0)
return feat_pad
def prepare_test(self, idx):
return self.prepare_train(idx)
def evaluate(self, results):
return ava_evaluate_results(self.info, len(self), results,
self.custom_classes, self.label_file,
self.file_path, self.exclude_file)
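# --- Hedged illustration (added; not in the original file). load_file above
# parses AVA-style CSV rows
#   <video_id>,<timestamp>,<x1>,<y1>,<x2>,<y2>,<label>,<entity_id>
# and parse_img_record merges records that share a frame and an entity box
# into stacked bboxes/labels/entity_ids. A tiny standalone check:
if __name__ == '__main__':
    ds = AVADataset.__new__(AVADataset)  # bypass file loading for the demo
    ds.num_classes = 81
    records = [
        dict(entity_box=np.array([0.1, 0.2, 0.3, 0.4]), label=5, entity_id=1),
        dict(entity_box=np.array([0.1, 0.2, 0.3, 0.4]), label=7, entity_id=1),
    ]
    bboxes, labels, entity_ids = ds.parse_img_record(records)
    print(bboxes.shape, labels.shape, entity_ids)  # (1, 4) (1, 81) [1]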

@ -0,0 +1,80 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import numpy as np
from abc import ABC, abstractmethod
import paddle
from paddle.io import Dataset
class BaseDataset(Dataset, ABC):
"""Base class for datasets
All datasets should subclass it.
All subclass should overwrite:
- Method: `load_file`, load info from index file.
- Method: `prepare_train`, providing train data.
- Method: `prepare_test`, providing test data.
    Args:
        file_path (str): index file path.
        pipeline (Sequence): a sequence of data transforms.
        data_prefix (str): directory path of the data. Default: None.
        test_mode (bool): whether to build test dataset. Default: False.
    """
def __init__(self, file_path, pipeline, data_prefix=None, test_mode=False):
super().__init__()
self.file_path = file_path
self.data_prefix = osp.realpath(data_prefix) if \
data_prefix is not None and osp.isdir(data_prefix) else data_prefix
self.test_mode = test_mode
self.pipeline = pipeline
self.info = self.load_file()
@abstractmethod
def load_file(self):
"""load the video information from the index file path."""
pass
def prepare_train(self, idx):
"""TRAIN & VALID. Prepare the data for training/valid given the index."""
#Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
#unsqueeze label to list
return results['imgs'], np.array([results['labels']])
def prepare_test(self, idx):
"""TEST: Prepare the data for test given the index."""
#Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
#unsqueeze label to list
return results['imgs'], np.array([results['labels']])
def __len__(self):
"""get the size of the dataset."""
return len(self.info)
def __getitem__(self, idx):
""" Get the sample for either training or testing given index"""
if self.test_mode:
return self.prepare_test(idx)
else:
return self.prepare_train(idx)
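# --- Hedged sketch (added; not in the original file). A minimal subclass
# only needs load_file; prepare_train/prepare_test are inherited and expect
# the pipeline to fill results['imgs']. The "<filename> <label>" index format
# here is an assumption for illustration.
class ToyDataset(BaseDataset):
    def load_file(self):
        info = []
        with open(self.file_path, 'r') as fin:
            for line in fin:
                filename, label = line.strip().split()
                info.append(dict(filename=filename, labels=int(label)))
        return info
# Real datasets additionally register themselves via @DATASETS.register() so
# that build_dataset can construct them from the config (see builder.py).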

@ -0,0 +1,72 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import json
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class BMNDataset(BaseDataset):
"""Video dataset for action localization.
"""
def __init__(
self,
file_path,
pipeline,
subset,
**kwargs,
):
self.subset = subset
super().__init__(file_path, pipeline, **kwargs)
def load_file(self):
"""Load index file to get video information."""
info = []
annos = json.load(open(self.file_path))
for video_name in annos.keys():
video_subset = annos[video_name]["subset"]
if self.subset in video_subset:
info.append(
dict(
video_name=video_name,
video_info=annos[video_name],
))
        # sort by video_name
        info.sort(key=lambda elem: elem['video_name'])
#add video_idx to info
for idx, elem in enumerate(info):
info[idx]['video_idx'] = idx
logger.info("{} subset video numbers: {}".format(
self.subset, len(info)))
return info
def prepare_train(self, idx):
"""TRAIN & VALID: Prepare data for training/valid given the index."""
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
return results['video_feat'], results['gt_iou_map'], results['gt_start'],\
results['gt_end']
def prepare_test(self, idx):
"""TEST: Prepare the data for test given the index."""
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
return results['video_feat'], results['gt_iou_map'], results['gt_start'], \
results['gt_end'], results['video_idx']
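# --- Hedged usage sketch (added; not in the original file). load_file expects
# an ActivityNet-style annotation JSON keyed by video name, each entry with at
# least a "subset" field; subset='train' matches "training" via the substring
# test `self.subset in video_subset`.
if __name__ == '__main__':
    import tempfile
    annos = {"v_xxx": {"subset": "training"}, "v_yyy": {"subset": "validation"}}
    with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as f:
        json.dump(annos, f)
    ds = BMNDataset(file_path=f.name, pipeline=None, subset='train')
    print([elem['video_name'] for elem in ds.info])  # ['v_xxx']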

@ -0,0 +1,189 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import os.path as osp
import copy
import random
import numpy as np
import shutil
from PIL import Image
import cv2
from paddle.io import Dataset
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
class VOS_Test(Dataset):
"""process frames in each video
"""
def __init__(self,
image_root,
label_root,
seq_name,
images,
labels,
pipeline=None,
rgb=False,
resolution=None):
self.image_root = image_root
self.label_root = label_root
self.seq_name = seq_name
self.images = images # image file list
self.labels = labels
self.obj_num = 1
self.num_frame = len(self.images)
self.pipeline = pipeline
self.rgb = rgb
self.resolution = resolution
self.obj_nums = []
temp_obj_num = 0
for img_name in self.images:
self.obj_nums.append(temp_obj_num)
current_label_name = img_name.split('.')[0] + '.png'
if current_label_name in self.labels:
current_label = self.read_label(current_label_name)
if temp_obj_num < np.unique(
current_label)[-1]: #get object number from label_id
temp_obj_num = np.unique(current_label)[-1]
def __len__(self):
return len(self.images)
def read_image(self, idx):
img_name = self.images[idx]
img_path = os.path.join(self.image_root, self.seq_name, img_name)
img = cv2.imread(img_path)
img = np.array(img, dtype=np.float32)
if self.rgb:
img = img[:, :, [2, 1, 0]]
return img
def read_label(self, label_name):
label_path = os.path.join(self.label_root, self.seq_name, label_name)
label = Image.open(label_path)
label = np.array(label, dtype=np.uint8)
return label
def __getitem__(self, idx):
img_name = self.images[idx]
current_img = self.read_image(idx)
current_img = np.array(current_img)
height, width, channels = current_img.shape
if self.resolution is not None:
width = int(np.ceil(float(width) * self.resolution / float(height)))
height = int(self.resolution)
current_label_name = img_name.split('.')[0] + '.png'
obj_num = self.obj_nums[idx]
if current_label_name in self.labels:
current_label = self.read_label(current_label_name)
current_label = np.array(current_label)
sample = {
'current_img': current_img,
'current_label': current_label
}
else:
sample = {
'current_img': current_img
} #only the first frame contains label
sample['meta'] = {
'seq_name': self.seq_name,
'frame_num': self.num_frame,
'obj_num': obj_num,
'current_name': img_name,
'height': height,
'width': width,
'flip': False
}
if self.pipeline is not None:
sample = self.pipeline(sample)
for s in sample:
s['current_img'] = np.array(s['current_img'])
if 'current_label' in s.keys():
s['current_label'] = s['current_label']
return sample
@DATASETS.register()
class DavisDataset(BaseDataset):
"""Davis 2017 dataset.
"""
def __init__(
self,
file_path,
result_root,
pipeline,
data_prefix=None,
test_mode=False,
year=2017,
rgb=False,
resolution='480p',
):
self.rgb = rgb
self.result_root = result_root
self.resolution = resolution
self.year = year
self.spt = 'val' if test_mode else 'train'
super().__init__(file_path, pipeline, data_prefix, test_mode)
def load_file(self):
self.image_root = os.path.join(self.file_path, 'JPEGImages',
self.resolution)
self.label_root = os.path.join(self.file_path, 'Annotations',
self.resolution)
seq_names = []
with open(
os.path.join(self.file_path, 'ImageSets', str(self.year),
self.spt + '.txt')) as f:
seqs_tmp = f.readlines()
seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp))
seq_names.extend(seqs_tmp)
self.info = list(np.unique(seq_names))
return self.info
def prepare_test(self, idx):
seq_name = self.info[idx] #video name
images = list(
np.sort(os.listdir(os.path.join(self.image_root, seq_name))))
labels = [images[0].replace('jpg', 'png')] #we have first frame target
# copy first frame target
if not os.path.isfile(
os.path.join(self.result_root, seq_name, labels[0])):
if not os.path.exists(os.path.join(self.result_root, seq_name)):
os.makedirs(os.path.join(self.result_root, seq_name))
source_label_path = os.path.join(self.label_root, seq_name,
labels[0])
result_label_path = os.path.join(self.result_root, seq_name,
labels[0])
shutil.copy(source_label_path, result_label_path)
seq_dataset = VOS_Test(self.image_root,
self.label_root,
seq_name,
images,
labels,
self.pipeline,
rgb=self.rgb,
resolution=480)
return seq_dataset
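# --- Hedged note (added; not in the original file). load_file above assumes
# the standard DAVIS layout under file_path:
#   JPEGImages/480p/<seq>/00000.jpg ...
#   Annotations/480p/<seq>/00000.png ...
#   ImageSets/2017/val.txt  (one sequence name per line)
# prepare_test copies the first-frame annotation into result_root and wraps
# each sequence in a VOS_Test dataset for frame-by-frame inference.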

@ -0,0 +1,80 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import os.path as osp
from ..registry import DATASETS
from .base import BaseDataset
@DATASETS.register()
class FeatureDataset(BaseDataset):
"""Feature dataset for action recognition
Example:(TODO)
Args:(TODO)
"""
def __init__(
self,
file_path,
pipeline,
data_prefix=None,
test_mode=False,
suffix=None,
):
self.suffix = suffix
super().__init__(file_path, pipeline, data_prefix, test_mode)
def load_file(self):
"""Load index file to get video information."""
info = []
with open(self.file_path, 'r') as fin:
for line in fin:
filename = line.strip().split()[0]
if self.data_prefix is not None:
filename = osp.join(self.data_prefix, filename)
if self.suffix is not None:
filename = filename + self.suffix
info.append(dict(filename=filename))
return info
def prepare_train(self, idx):
"""TRAIN & VALID. Prepare the data for training/valid given the index."""
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
if 'iou_norm' in results:
return results['rgb_data'], results['rgb_len'], results[
'rgb_mask'], results['audio_data'], results[
'audio_len'], results['audio_mask'], results[
'labels'], results['iou_norm']
else:
return results['rgb_data'], results['rgb_len'], results[
'rgb_mask'], results['audio_data'], results[
'audio_len'], results['audio_mask'], results['labels']
def prepare_test(self, idx):
"""TEST. Prepare the data for testing given the index."""
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
if 'iou_norm' in results:
return results['rgb_data'], results['rgb_len'], results[
'rgb_mask'], results['audio_data'], results[
'audio_len'], results['audio_mask'], results[
'labels'], results['iou_norm']
else:
return results['rgb_data'], results['rgb_len'], results[
'rgb_mask'], results['audio_data'], results[
'audio_len'], results['audio_mask'], results['labels']

@ -0,0 +1,177 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import random
import numpy as np
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class FrameDataset(BaseDataset):
"""Rawframe dataset for action recognition.
The dataset loads raw frames from frame files, and apply specified transform operatation them.
The indecx file is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace.
Example of an index file:
.. code-block:: txt
file_path-1 150 1
file_path-2 160 1
file_path-3 170 2
file_path-4 180 2
Args:
file_path (str): Path to the index file.
pipeline(XXX):
data_prefix (str): directory path of the data. Default: None.
test_mode (bool): Whether to bulid the test dataset. Default: False.
suffix (str): suffix of file. Default: 'img_{:05}.jpg'.
"""
def __init__(self,
file_path,
pipeline,
num_retries=5,
data_prefix=None,
test_mode=False,
suffix='img_{:05}.jpg'):
self.num_retries = num_retries
self.suffix = suffix
super().__init__(file_path, pipeline, data_prefix, test_mode)
def load_file(self):
"""Load index file to get video information."""
info = []
with open(self.file_path, 'r') as fin:
for line in fin:
line_split = line.strip().split()
frame_dir, frames_len, labels = line_split
if self.data_prefix is not None:
frame_dir = osp.join(self.data_prefix, frame_dir)
info.append(
dict(frame_dir=frame_dir,
suffix=self.suffix,
frames_len=frames_len,
labels=int(labels)))
return info
def prepare_train(self, idx):
"""Prepare the frames for training/valid given index. """
        # Try to catch exceptions caused by missing frame files
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['frame_dir'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return results['imgs'], np.array([results['labels']])
def prepare_test(self, idx):
"""Prepare the frames for test given index. """
        # Try to catch exceptions caused by missing frame files
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['frame_dir'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return results['imgs'], np.array([results['labels']])
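
# Sketch of how one index line becomes an info dict in FrameDataset.load_file();
# the path and numbers below are made-up examples matching the docstring format.
def _example_parse_frame_index_line(line="frames/video_001 150 1"):
    frame_dir, frames_len, labels = line.strip().split()
    return dict(frame_dir=frame_dir,
                suffix='img_{:05}.jpg',
                frames_len=frames_len,
                labels=int(labels))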
@DATASETS.register()
class FrameDataset_Sport(BaseDataset):
"""Video dataset for action recognition
    The dataset loads raw videos and applies the specified transforms to them.
    The index file is a file with multiple lines, and each line indicates
    a sample video with the filepath and label, which are split by a whitespace.
    Example of an index file:
.. code-block:: txt
path/000.mp4 1
path/001.mp4 1
path/002.mp4 2
path/003.mp4 2
Args:
file_path(str): Path to the index file.
pipeline(XXX): A sequence of data transforms.
**kwargs: Keyword arguments for ```BaseDataset```.
"""
def __init__(self, file_path, pipeline, num_retries=5, suffix='', **kwargs):
self.num_retries = num_retries
self.suffix = suffix
super().__init__(file_path, pipeline, **kwargs)
def load_file(self):
"""Load index file to get video information."""
info = []
with open(self.file_path, 'r') as fin:
for line in fin:
line_split = line.strip().split()
frame_dir = line_split[0]
if self.data_prefix is not None:
frame_dir = osp.join(self.data_prefix, frame_dir)
info.append(dict(frame_dir=frame_dir, suffix=self.suffix))
return info
def prepare_train(self, idx):
"""TRAIN & VALID. Prepare the data for training/valid given the index."""
#Try to catch Exception caused by reading corrupted video file
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['filename'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return results['imgs'], np.array([results['labels']])
def prepare_test(self, idx):
"""TEST. Prepare the data for test given the index."""
#Try to catch Exception caused by reading corrupted video file
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['filename'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return results['imgs'], np.array([results['labels']])

@ -0,0 +1,110 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import os
import numpy as np
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class MSTCNDataset(BaseDataset):
"""Video dataset for action segmentation.
"""
def __init__(
self,
file_path,
pipeline,
feature_path,
gt_path,
actions_map_file_path,
**kwargs,
):
super().__init__(file_path, pipeline, **kwargs)
self.gt_path = gt_path
self.actions_map_file_path = actions_map_file_path
self.feature_path = feature_path
# actions dict generate
file_ptr = open(self.actions_map_file_path, 'r')
actions = file_ptr.read().split('\n')[:-1]
file_ptr.close()
self.actions_dict = dict()
for a in actions:
self.actions_dict[a.split()[1]] = int(a.split()[0])
self.num_classes = len(self.actions_dict.keys())
def load_file(self):
"""Load index file to get video information."""
file_ptr = open(self.file_path, 'r')
info = file_ptr.read().split('\n')[:-1]
file_ptr.close()
return info
def prepare_train(self, idx):
"""TRAIN & VALID: Prepare data for training/valid given the index."""
results = {}
video_name = self.info[idx]
# load video feature
file_name = video_name.split('.')[0] + ".npy"
feat_file_path = os.path.join(self.feature_path, file_name)
#TODO: check path
video_feat = np.load(feat_file_path)
# load label
target_file_path = os.path.join(self.gt_path, video_name)
file_ptr = open(target_file_path, 'r')
content = file_ptr.read().split('\n')[:-1]
classes = np.zeros(min(np.shape(video_feat)[1], len(content)), dtype='int64')
for i in range(len(classes)):
classes[i] = self.actions_dict[content[i]]
# classes = classes * (-100)
results['video_feat'] = copy.deepcopy(video_feat)
results['video_gt'] = copy.deepcopy(classes)
results = self.pipeline(results)
return results['video_feat'], results['video_gt']
def prepare_test(self, idx):
"""TEST: Prepare the data for test given the index."""
results = {}
video_name = self.info[idx]
# load video feature
file_name = video_name.split('.')[0] + ".npy"
feat_file_path = os.path.join(self.feature_path, file_name)
#TODO: check path
video_feat = np.load(feat_file_path)
# load label
target_file_path = os.path.join(self.gt_path, video_name)
file_ptr = open(target_file_path, 'r')
content = file_ptr.read().split('\n')[:-1]
classes = np.zeros(min(np.shape(video_feat)[1], len(content)))
for i in range(len(classes)):
classes[i] = self.actions_dict[content[i]]
# classes = classes * (-100)
results['video_feat'] = copy.deepcopy(video_feat)
results['video_gt'] = copy.deepcopy(classes)
results = self.pipeline(results)
return results['video_feat'], results['video_gt']
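
# Sketch (assumed file layout): each line of the actions map file is
# "<id> <action_name>", e.g. "0 background" / "1 cut", so MSTCNDataset maps
# action names to integer ids exactly as in __init__ above.
def _example_build_actions_dict(lines=("0 background", "1 cut")):
    actions_dict = {}
    for a in lines:
        actions_dict[a.split()[1]] = int(a.split()[0])
    return actions_dict  # {'background': 0, 'cut': 1}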

@ -0,0 +1,220 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import random
import numpy as np
try:
import lmdb
except ImportError as e:
print(
f"Warning! {e}, [lmdb] package and it's dependencies is required for ActBERT."
)
import pickle
try:
from paddlenlp.transformers import BertTokenizer
except ImportError as e:
print(
f"Warning! {e}, [paddlenlp] package and it's dependencies is required for ActBERT."
)
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class MSRVTTDataset(BaseDataset):
"""MSR-VTT dataset for text-video clip retrieval.
"""
def __init__(
self,
file_path,
pipeline,
features_path,
bert_model="bert-base-uncased",
padding_index=0,
max_seq_length=36,
max_region_num=36,
max_action_num=5,
vision_feature_dim=2048,
action_feature_dim=2048,
spatials_dim=5,
data_prefix=None,
test_mode=False,
):
self.features_path = features_path
self.bert_model = bert_model
self.padding_index = padding_index
self.max_seq_length = max_seq_length
self.max_region_num = max_region_num
self._max_action_num = max_action_num
self.vision_feature_dim = vision_feature_dim
self.action_feature_dim = action_feature_dim
self.spatials_dim = spatials_dim
self._tokenizer = BertTokenizer.from_pretrained(bert_model,
do_lower_case=True)
super().__init__(file_path, pipeline, data_prefix, test_mode)
self.tokenize()
self.gen_feature()
def load_file(self):
"""Load index file to get video information."""
with open(self.file_path) as fin:
self.image_entries = []
self.caption_entries = []
for line in fin.readlines():
line = line.strip()
vid_id = line.split(',')[0]
self.image_entries.append(vid_id)
self.caption_entries.append({
"caption": line.split(',')[1],
"vid_id": vid_id
})
self.env = lmdb.open(self.features_path)
def tokenize(self):
for entry in self.caption_entries:
tokens = []
tokens.append("[CLS]")
for token in self._tokenizer.tokenize(entry["caption"]):
tokens.append(token)
tokens.append("[SEP]")
tokens = self._tokenizer.convert_tokens_to_ids(tokens)
segment_ids = [0] * len(tokens)
input_mask = [1] * len(tokens)
if len(tokens) < self.max_seq_length:
padding = [self.padding_index
] * (self.max_seq_length - len(tokens))
tokens = tokens + padding
input_mask += padding
segment_ids += padding
entry["token"] = np.array(tokens).astype('int64')
entry["input_mask"] = np.array(input_mask)
entry["segment_ids"] = np.array(segment_ids).astype('int64')
def get_image_feature(self, video_id):
video_id = str(video_id).encode()
with self.env.begin(write=False) as txn:
item = pickle.loads(txn.get(video_id))
video_id = item["video_id"]
image_h = int(item["image_h"])
image_w = int(item["image_w"])
features = item["features"].reshape(-1, self.vision_feature_dim)
boxes = item["boxes"].reshape(-1, 4)
num_boxes = features.shape[0]
g_feat = np.sum(features, axis=0) / num_boxes
num_boxes = num_boxes + 1
features = np.concatenate(
[np.expand_dims(g_feat, axis=0), features], axis=0)
action_features = item["action_features"].reshape(
-1, self.action_feature_dim)
image_location = np.zeros((boxes.shape[0], self.spatials_dim),
dtype=np.float32)
image_location[:, :4] = boxes
image_location[:,
4] = ((image_location[:, 3] - image_location[:, 1]) *
(image_location[:, 2] - image_location[:, 0]) /
(float(image_w) * float(image_h)))
image_location[:, 0] = image_location[:, 0] / float(image_w)
image_location[:, 1] = image_location[:, 1] / float(image_h)
image_location[:, 2] = image_location[:, 2] / float(image_w)
image_location[:, 3] = image_location[:, 3] / float(image_h)
g_location = np.array([0, 0, 1, 1, 1])
image_location = np.concatenate(
[np.expand_dims(g_location, axis=0), image_location], axis=0)
return features, num_boxes, image_location, action_features
def gen_feature(self):
num_inst = len(self.image_entries) #1000
self.features_all = np.zeros(
(num_inst, self.max_region_num, self.vision_feature_dim))
self.action_features_all = np.zeros(
(num_inst, self._max_action_num, self.action_feature_dim))
self.spatials_all = np.zeros(
(num_inst, self.max_region_num, self.spatials_dim))
self.image_mask_all = np.zeros((num_inst, self.max_region_num))
self.action_mask_all = np.zeros((num_inst, self._max_action_num))
for i, image_id in enumerate(self.image_entries):
features, num_boxes, boxes, action_features = self.get_image_feature(
image_id)
mix_num_boxes = min(int(num_boxes), self.max_region_num)
mix_boxes_pad = np.zeros((self.max_region_num, self.spatials_dim))
mix_features_pad = np.zeros(
(self.max_region_num, self.vision_feature_dim))
image_mask = [1] * (int(mix_num_boxes))
while len(image_mask) < self.max_region_num:
image_mask.append(0)
action_mask = [1] * (self._max_action_num)
while len(action_mask) < self._max_action_num:
action_mask.append(0)
mix_boxes_pad[:mix_num_boxes] = boxes[:mix_num_boxes]
mix_features_pad[:mix_num_boxes] = features[:mix_num_boxes]
self.features_all[i] = mix_features_pad
x = action_features.shape[0]
self.action_features_all[i][:x] = action_features[:]
self.image_mask_all[i] = np.array(image_mask)
self.action_mask_all[i] = np.array(action_mask)
self.spatials_all[i] = mix_boxes_pad
self.features_all = self.features_all.astype("float32")
self.action_features_all = self.action_features_all.astype("float32")
self.image_mask_all = self.image_mask_all.astype("int64")
self.action_mask_all = self.action_mask_all.astype("int64")
self.spatials_all = self.spatials_all.astype("float32")
def prepare_train(self, idx):
pass
def prepare_test(self, idx):
entry = self.caption_entries[idx]
caption = entry["token"]
input_mask = entry["input_mask"]
segment_ids = entry["segment_ids"]
target_all = np.zeros(1000)
for i, image_id in enumerate(self.image_entries):
if image_id == entry["vid_id"]:
target_all[i] = 1
return (
caption,
self.action_features_all,
self.features_all,
self.spatials_all,
segment_ids,
input_mask,
self.image_mask_all,
self.action_mask_all,
target_all,
)
def __len__(self):
return len(self.caption_entries)
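
# Minimal sketch of the padding step in tokenize(): token ids shorter than
# max_seq_length are right-padded with padding_index, and the same padding is
# appended to the mask and segment ids (which only works because
# padding_index is 0). The token ids below are made up; real ones come from
# BertTokenizer.
def _example_pad_tokens(tokens=(101, 2054, 102), max_seq_length=6,
                        padding_index=0):
    tokens = list(tokens)
    input_mask = [1] * len(tokens)
    segment_ids = [0] * len(tokens)
    padding = [padding_index] * (max_seq_length - len(tokens))
    return tokens + padding, input_mask + padding, segment_ids + padding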

@ -0,0 +1,62 @@
# Copyright Niantic 2019. Patent Pending. All rights reserved.
#
# This software is licensed under the terms of the Monodepth2 licence
# which allows for non-commercial use only, the full terms of which are made
# available in the LICENSE file.
from __future__ import absolute_import, division, print_function
import copy
from os import path as osp
from PIL import Image
from ..registry import DATASETS
from .base import BaseDataset
def pil_loader(path):
# open path as file to avoid ResourceWarning
# (https://github.com/python-pillow/Pillow/issues/835)
with open(path, 'rb') as f:
with Image.open(f) as img:
return img.convert('RGB')
@DATASETS.register()
class MonoDataset(BaseDataset):
def __init__(self,
file_path,
data_prefix,
pipeline,
num_retries=0,
suffix='.png',
**kwargs):
self.num_retries = num_retries
self.suffix = suffix
super().__init__(file_path, pipeline, data_prefix, **kwargs)
def load_file(self):
info = []
with open(self.file_path, 'r') as f:
for line in f:
filename = line.strip() + self.suffix
folder = osp.dirname(filename)
frame_index = line.strip().split('/')[1]
info.append(
dict(data_path=self.data_prefix,
filename=filename,
folder=folder,
frame_index=int(frame_index)))
return info
def prepare_train(self, idx):
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
results['imgs']['idx'] = idx
return results['imgs'], results['day_or_night']
def prepare_test(self, idx):
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
return results['imgs'], results['day_or_night']
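
# Sketch of load_file() on one index line; the "folder/frame_id" layout is
# assumed, e.g. "scene_01/0000000042".
def _example_parse_mono_line(line="scene_01/0000000042", suffix='.png'):
    filename = line.strip() + suffix
    folder = osp.dirname(filename)
    frame_index = line.strip().split('/')[1]
    return dict(filename=filename, folder=folder,
                frame_index=int(frame_index))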

@ -0,0 +1,78 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import random
import numpy as np
import pickle
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class SkeletonDataset(BaseDataset):
"""
Skeleton dataset for action recognition.
    The dataset loads skeleton features and applies normalization operations.
Args:
file_path (str): Path to the index file.
pipeline(obj): Define the pipeline of data preprocessing.
data_prefix (str): directory path of the data. Default: None.
        test_mode (bool): Whether to build the test dataset. Default: False.
"""
def __init__(self, file_path, pipeline, label_path=None, test_mode=False):
self.label_path = label_path
super().__init__(file_path, pipeline, test_mode=test_mode)
def load_file(self):
"""Load feature file to get skeleton information."""
logger.info("Loading data, it will take some moment...")
self.data = np.load(self.file_path)
if self.label_path:
if self.label_path.endswith('npy'):
self.label = np.load(self.label_path)
elif self.label_path.endswith('pkl'):
with open(self.label_path, 'rb') as f:
sample_name, self.label = pickle.load(f)
else:
logger.info(
"Label path not provided when test_mode={}, here just output predictions."
.format(self.test_mode))
logger.info("Data Loaded!")
return self.data # used for __len__
def prepare_train(self, idx):
"""Prepare the feature for training/valid given index. """
results = dict()
results['data'] = copy.deepcopy(self.data[idx])
results['label'] = copy.deepcopy(self.label[idx])
results = self.pipeline(results)
return results['data'], results['label']
def prepare_test(self, idx):
"""Prepare the feature for test given index. """
results = dict()
results['data'] = copy.deepcopy(self.data[idx])
if self.label_path:
results['label'] = copy.deepcopy(self.label[idx])
results = self.pipeline(results)
return results['data'], results['label']
else:
results = self.pipeline(results)
return [results['data']]
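
# Sketch of the two label formats load_file() accepts: a plain .npy array,
# or a .pkl holding a (sample_name, label) pair.
def _example_load_labels(label_path):
    if label_path.endswith('npy'):
        return np.load(label_path)
    with open(label_path, 'rb') as f:
        _sample_name, label = pickle.load(f)
    return label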

@ -0,0 +1,143 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import random
import numpy as np
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class SFVideoDataset(BaseDataset):
"""Video dataset for action recognition
    The dataset loads raw videos and applies the specified transforms to them.
    The index file is a file with multiple lines, and each line indicates
    a sample video with the filepath and label, which are split by a whitespace.
    Example of an index file:
.. code-block:: txt
path/000.mp4 1
path/001.mp4 1
path/002.mp4 2
path/003.mp4 2
Args:
file_path(str): Path to the index file.
pipeline(XXX): A sequence of data transforms.
        num_ensemble_views(int): Number of temporal views per video for multi-crop testing.
        num_spatial_crops(int): Number of spatial crops per view for multi-crop testing.
**kwargs: Keyword arguments for ```BaseDataset```.
"""
def __init__(
self,
file_path,
pipeline,
num_ensemble_views=1,
num_spatial_crops=1,
num_retries=5,
num_samples_precise_bn=None,
**kwargs,
):
self.num_ensemble_views = num_ensemble_views
self.num_spatial_crops = num_spatial_crops
self.num_retries = num_retries
self.num_samples_precise_bn = num_samples_precise_bn
super().__init__(file_path, pipeline, **kwargs)
#set random seed
random.seed(0)
np.random.seed(0)
def load_file(self):
"""Load index file to get video information."""
info = []
with open(self.file_path, 'r') as fin:
for line in fin:
line_split = line.strip().split()
filename, labels = line_split
if self.data_prefix is not None:
filename = osp.join(self.data_prefix, filename)
for tidx in range(self.num_ensemble_views):
for sidx in range(self.num_spatial_crops):
info.append(
dict(
filename=filename,
labels=int(labels),
temporal_sample_index=tidx,
spatial_sample_index=sidx,
temporal_num_clips=self.num_ensemble_views,
spatial_num_clips=self.num_spatial_crops,
))
return info
def prepare_train(self, idx):
"""TRAIN & VALID. Prepare the data for training given the index."""
#Try to catch Exception caused by reading corrupted video file
short_cycle = False
if isinstance(idx, tuple):
idx, short_cycle_idx = idx
short_cycle = True
for ir in range(self.num_retries):
try:
#Multi-grid short cycle
if short_cycle:
results = copy.deepcopy(self.info[idx])
results['short_cycle_idx'] = short_cycle_idx
else:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['filename'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return results['imgs'][0], results['imgs'][1], np.array(
[results['labels']])
def prepare_test(self, idx):
"""TEST. Prepare the data for test given the index."""
#Try to catch Exception caused by reading corrupted video file
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['filename'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return results['imgs'][0], results['imgs'][1], np.array(
[results['labels']]), np.array([idx])
def __len__(self):
"""get the size of the dataset."""
if self.num_samples_precise_bn is None:
return len(self.info)
else:
random.shuffle(self.info)
return min(self.num_samples_precise_bn, len(self.info))
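
# Sketch: with num_ensemble_views=3 and num_spatial_crops=2, load_file()
# expands each video line into 3 * 2 = 6 entries, one per
# (temporal_sample_index, spatial_sample_index) pair, which is how
# multi-crop testing enumerates clips.
def _example_multi_crop_views(num_ensemble_views=3, num_spatial_crops=2):
    return [(tidx, sidx) for tidx in range(num_ensemble_views)
            for sidx in range(num_spatial_crops)]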

@ -0,0 +1,89 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import random
import numpy as np
import pickle
import paddle
from paddle.io import Dataset
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class UCF101SkeletonDataset(BaseDataset):
"""
Skeleton dataset for action recognition.
    The dataset loads skeleton features and applies normalization operations.
Args:
file_path (str): Path to the index file.
pipeline(obj): Define the pipeline of data preprocessing.
        test_mode (bool): Whether to build the test dataset. Default: False.
"""
def __init__(self,
file_path,
pipeline,
split,
repeat_times,
test_mode=False):
self.split = split
self.repeat_times = repeat_times
super().__init__(file_path, pipeline, test_mode=test_mode)
self._ori_len = len(self.info)
self.start_index = 0
self.modality = "Pose"
def load_file(self):
"""Load annotation file to get video information."""
assert self.file_path.endswith('.pkl')
return self.load_pkl_annotations()
def load_pkl_annotations(self):
with open(self.file_path, "rb") as f:
data = pickle.load(f)
if self.split:
split, data = data['split'], data['annotations']
identifier = 'filename' if 'filename' in data[0] else 'frame_dir'
data = [x for x in data if x[identifier] in split[self.split]]
return data
def prepare_train(self, idx):
"""Prepare the frames for training given the index."""
results = copy.deepcopy(self.info[idx % self._ori_len])
results['modality'] = self.modality
results['start_index'] = self.start_index
return self.pipeline(results)
def prepare_test(self, idx):
"""Prepare the frames for testing given the index."""
results = copy.deepcopy(self.info[idx % self._ori_len])
results['modality'] = self.modality
results['start_index'] = self.start_index
return self.pipeline(results)
def __len__(self):
"""get the size of the dataset."""
return len(self.info) * self.repeat_times
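
# Sketch (assumed pickle layout): the annotation file holds
# {'split': {'train1': [...], ...}, 'annotations': [...]}, and
# load_pkl_annotations() keeps only the samples whose identifier is listed
# in the requested split, exactly as above.
def _example_filter_split(data, split_name):
    split, annos = data['split'], data['annotations']
    identifier = 'filename' if 'filename' in annos[0] else 'frame_dir'
    return [x for x in annos if x[identifier] in split[split_name]]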

@ -0,0 +1,76 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import numpy as np
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class UCF24Dataset(BaseDataset):
"""Dataset for YOWO
    The dataset loads raw videos and applies the specified transforms to them.
    The index file is a file with multiple lines, and each line indicates
    a sample video with the filepath and label, which are split by a whitespace.
    Example of an index file:
.. code-block:: txt
Args:
file_path(str): Path to the index file.
pipeline(XXX): A sequence of data transforms.
**kwargs: Keyword arguments for ```BaseDataset```.
"""
def __init__(self, file_path, pipeline, num_retries=5, **kwargs):
self.num_retries = num_retries
super().__init__(file_path, pipeline, **kwargs)
def load_file(self):
"""Load index file to get video information."""
info = []
with open(self.file_path, 'r') as fin:
lines = fin.readlines()
for line in lines:
line = line.strip() # 'data/ucf24/labels/class_name/video_name/key_frame.txt'
filename = line.replace('txt', 'jpg').replace(
'labels', 'rgb-images') # key frame path
info.append(dict(filename=filename))
return info
def prepare_train(self, idx):
"""TRAIN & VALID. Prepare the data for training/valid given the index."""
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
im_path = results['filename']
im_path = im_path.replace('jpg', 'txt')
im_split = im_path.split('/')
frame_index = im_split[3] + '_' + im_split[4] + '_' + im_split[5]
return results['imgs'], np.array([results['labels']]), frame_index
def prepare_test(self, idx):
"""TEST. Prepare the data for test given the index."""
# Try to catch Exception caused by reading corrupted video file
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
im_path = results['filename']
im_path = im_path.replace('jpg', 'txt')
im_split = im_path.split('/')
frame_index = im_split[3] + '_' + im_split[4] + '_' + im_split[5]
return results['imgs'], np.array([results['labels']]), frame_index
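
# Sketch of the label-path to image-path mapping used above; the class and
# video names in the path are made-up examples following load_file()'s
# comment.
def _example_ucf24_paths(
        label='data/ucf24/labels/Basketball/v_Basketball_g01_c01/00009.txt'):
    image = label.replace('txt', 'jpg').replace('labels', 'rgb-images')
    parts = label.split('/')
    frame_index = parts[3] + '_' + parts[4] + '_' + parts[5]
    return image, frame_index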

@ -0,0 +1,95 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import random
import numpy as np
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class VideoDataset(BaseDataset):
"""Video dataset for action recognition
    The dataset loads raw videos and applies the specified transforms to them.
    The index file is a file with multiple lines, and each line indicates
    a sample video with the filepath and label, which are split by a whitespace.
    Example of an index file:
.. code-block:: txt
path/000.mp4 1
path/001.mp4 1
path/002.mp4 2
path/003.mp4 2
Args:
file_path(str): Path to the index file.
pipeline(XXX): A sequence of data transforms.
**kwargs: Keyword arguments for ```BaseDataset```.
"""
def __init__(self, file_path, pipeline, num_retries=5, suffix='', **kwargs):
self.num_retries = num_retries
self.suffix = suffix
super().__init__(file_path, pipeline, **kwargs)
def load_file(self):
"""Load index file to get video information."""
info = []
with open(self.file_path, 'r') as fin:
for line in fin:
line_split = line.strip().split()
filename, labels = line_split
                #TODO(hj): Required suffix format: maybe mp4/avi/wmv
filename = filename + self.suffix
if self.data_prefix is not None:
filename = osp.join(self.data_prefix, filename)
info.append(dict(filename=filename, labels=int(labels)))
return info
def prepare_train(self, idx):
"""TRAIN & VALID. Prepare the data for training/valid given the index."""
#Try to catch Exception caused by reading corrupted video file
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['filename'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return results['imgs'], np.array([results['labels']])
def prepare_test(self, idx):
"""TEST. Prepare the data for test given the index."""
#Try to catch Exception caused by reading corrupted video file
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['filename'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return results['imgs'], np.array([results['labels']])

@ -0,0 +1,56 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .anet_pipeline import GetMatchMap, GetVideoLabel, LoadFeat
from .augmentations import (CenterCrop, ColorJitter, GroupRandomFlip,
GroupResize, Image2Array, JitterScale, MultiCrop,
Normalization, PackOutput, RandomCrop, RandomFlip,
RandomResizedCrop, Scale, TenCrop, ToArray,
UniformCrop, RandomGamma, MultiCenterCrop,
RandomBrightness, RandomHue, RandomSaturation, YowoAug)
from .augmentations_ava import *
from .compose import Compose
from .decode import FeatureDecoder, FrameDecoder, VideoDecoder, ActionFeatureDecoder
from .decode_image import ImageDecoder
from .decode_sampler import DecodeSampler
from .mix import Cutmix, Mixup, VideoMix
from .multimodal import FeaturePadding, RandomCap, RandomMask, Tokenize
from .sample import Sampler, SamplerPkl
from .sample_ava import *
from .segmentation import MultiNorm, MultiRestrictSize
from .skeleton_pipeline import AutoPadding, Iden, SkeletonNorm
from .skeleton_pipeline import SketeonCropSample, SketeonModalityTransform, RandomRotation
from .skeleton_pipeline import (UniformSampleFrames, PoseDecode, PoseCompact,
RandomResizedCrop_V2, Flip_V2, CenterCrop_V2,
GeneratePoseTarget, FormatShape, Collect)
from .decode_sampler_MRI import SFMRI_DecodeSampler
from .segmentation_pipline import SegmentationSampler
from .sample_ucf24 import SamplerUCF24
__all__ = [
'ImageDecoder', 'RandomMask', 'UniformCrop', 'SkeletonNorm', 'Tokenize',
'Sampler', 'FeatureDecoder', 'DecodeSampler', 'TenCrop', 'Compose',
'AutoPadding', 'Normalization', 'Mixup', 'Image2Array', 'Scale',
'GroupResize', 'VideoDecoder', 'FrameDecoder', 'PackOutput',
'ActionFeatureDecoder', 'GetVideoLabel', 'Cutmix', 'CenterCrop',
'RandomCrop', 'LoadFeat', 'RandomCap', 'JitterScale', 'Iden', 'VideoMix',
'ColorJitter', 'RandomFlip', 'ToArray', 'FeaturePadding', 'GetMatchMap',
'GroupRandomFlip', 'MultiCrop', 'SFMRI_DecodeSampler', 'MultiRestrictSize',
'MultiNorm', 'RandomResizedCrop', 'SamplerPkl', 'SegmentationSampler',
'SketeonCropSample', 'SketeonModalityTransform', 'RandomRotation',
'RandomGamma', 'MultiCenterCrop', 'RandomBrightness', 'RandomHue',
'RandomSaturation', 'UniformSampleFrames', 'PoseDecode', 'PoseCompact',
'Resize', 'RandomResizedCrop_V2', 'Flip_V2', 'GeneratePoseTarget',
    'FormatShape', 'Collect', 'SamplerUCF24', 'YowoAug'
]

@ -0,0 +1,150 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
from ..registry import PIPELINES
"""pipeline ops for Activity Net.
"""
@PIPELINES.register()
class LoadFeat(object):
def __init__(self, feat_path):
self.feat_path = feat_path
def __call__(self, results):
video_name = results['video_name']
file_name = video_name + ".npy"
file_path = os.path.join(self.feat_path, file_name)
#TODO: check path
video_feat = np.load(file_path)
video_feat = video_feat.T
video_feat = video_feat.astype("float32")
results['video_feat'] = video_feat
return results
@PIPELINES.register()
class GetMatchMap(object):
def __init__(self, tscale):
self.tscale = tscale
self.tgap = 1. / self.tscale
def __call__(self, results):
match_map = []
for idx in range(self.tscale):
tmp_match_window = []
xmin = self.tgap * idx
for jdx in range(1, self.tscale + 1):
xmax = xmin + self.tgap * jdx
tmp_match_window.append([xmin, xmax])
match_map.append(tmp_match_window)
match_map = np.array(match_map)
match_map = np.transpose(match_map, [1, 0, 2])
match_map = np.reshape(match_map, [-1, 2])
anchor_xmin = [self.tgap * i for i in range(self.tscale)]
anchor_xmax = [self.tgap * i for i in range(1, self.tscale + 1)]
results['match_map'] = match_map
results['anchor_xmin'] = anchor_xmin
results['anchor_xmax'] = anchor_xmax
return results
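
# Worked example: for tscale=2 (tgap=0.5), GetMatchMap enumerates every
# (start, end) proposal on the unit timeline, then groups them by duration:
# [[0.0, 0.5], [0.5, 1.0], [0.0, 1.0], [0.5, 1.5]].
def _example_match_map(tscale=2):
    tgap = 1. / tscale
    match_map = [[[tgap * idx, tgap * idx + tgap * jdx]
                  for jdx in range(1, tscale + 1)] for idx in range(tscale)]
    return np.reshape(np.transpose(np.array(match_map), [1, 0, 2]), [-1, 2])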
@PIPELINES.register()
class GetVideoLabel(object):
def __init__(self, tscale, dscale, datatype="float32"):
self.tscale = tscale
self.dscale = dscale
self.tgap = 1. / self.tscale
self.datatype = datatype
def iou_with_anchors(self, anchors_min, anchors_max, box_min, box_max):
"""Compute jaccard score between a box and the anchors.
"""
len_anchors = anchors_max - anchors_min
int_xmin = np.maximum(anchors_min, box_min)
int_xmax = np.minimum(anchors_max, box_max)
inter_len = np.maximum(int_xmax - int_xmin, 0.)
union_len = len_anchors - inter_len + box_max - box_min
jaccard = np.divide(inter_len, union_len)
return jaccard
def ioa_with_anchors(self, anchors_min, anchors_max, box_min, box_max):
"""Compute intersection between score a box and the anchors.
"""
len_anchors = anchors_max - anchors_min
int_xmin = np.maximum(anchors_min, box_min)
int_xmax = np.minimum(anchors_max, box_max)
inter_len = np.maximum(int_xmax - int_xmin, 0.)
scores = np.divide(inter_len, len_anchors)
return scores
def __call__(self, results):
video_info = results['video_info']
match_map = results['match_map']
anchor_xmin = results['anchor_xmin']
anchor_xmax = results['anchor_xmax']
video_second = video_info['duration_second']
video_labels = video_info['annotations']
gt_bbox = []
gt_iou_map = []
for gt in video_labels:
tmp_start = max(min(1, gt["segment"][0] / video_second), 0)
tmp_end = max(min(1, gt["segment"][1] / video_second), 0)
gt_bbox.append([tmp_start, tmp_end])
tmp_gt_iou_map = self.iou_with_anchors(match_map[:, 0],
match_map[:, 1], tmp_start,
tmp_end)
tmp_gt_iou_map = np.reshape(tmp_gt_iou_map,
[self.dscale, self.tscale])
gt_iou_map.append(tmp_gt_iou_map)
gt_iou_map = np.array(gt_iou_map)
gt_iou_map = np.max(gt_iou_map, axis=0)
gt_bbox = np.array(gt_bbox)
gt_xmins = gt_bbox[:, 0]
gt_xmaxs = gt_bbox[:, 1]
gt_len_small = 3 * self.tgap
gt_start_bboxs = np.stack(
(gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2), axis=1)
gt_end_bboxs = np.stack(
(gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2), axis=1)
match_score_start = []
for jdx in range(len(anchor_xmin)):
match_score_start.append(
np.max(
self.ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
gt_start_bboxs[:, 0],
gt_start_bboxs[:, 1])))
match_score_end = []
for jdx in range(len(anchor_xmin)):
match_score_end.append(
np.max(
self.ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
gt_end_bboxs[:, 0], gt_end_bboxs[:,
1])))
gt_start = np.array(match_score_start)
gt_end = np.array(match_score_end)
results['gt_iou_map'] = gt_iou_map.astype(self.datatype)
results['gt_start'] = gt_start.astype(self.datatype)
results['gt_end'] = gt_end.astype(self.datatype)
return results
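
# Numeric check of the two overlap measures above: for an anchor [0.2, 0.6]
# and a ground-truth box [0.4, 0.8], the intersection is 0.2, the union is
# 0.6 and the anchor length is 0.4, so IoU = 1/3 and IoA (intersection over
# anchor) = 0.5.
def _example_overlaps(anchor=(0.2, 0.6), box=(0.4, 0.8)):
    inter = max(min(anchor[1], box[1]) - max(anchor[0], box[0]), 0.)
    union = (anchor[1] - anchor[0]) + (box[1] - box[0]) - inter
    return inter / union, inter / (anchor[1] - anchor[0])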

@ -0,0 +1,749 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import numpy as np
import math
from PIL import Image
from ..registry import PIPELINES
from collections.abc import Sequence
import cv2
pillow_interp_codes = {
'nearest': Image.NEAREST,
'bilinear': Image.BILINEAR,
'bicubic': Image.BICUBIC,
'box': Image.BOX,
'lanczos': Image.LANCZOS,
'hamming': Image.HAMMING
}
cv2_interp_codes = {
'nearest': cv2.INTER_NEAREST,
'bilinear': cv2.INTER_LINEAR,
'bicubic': cv2.INTER_CUBIC,
'area': cv2.INTER_AREA,
'lanczos': cv2.INTER_LANCZOS4
}
def _init_lazy_if_proper(results, lazy):
"""Initialize lazy operation properly.
Make sure that a lazy operation is properly initialized,
and avoid a non-lazy operation accidentally getting mixed in.
    Required keys in results are "imgs" if "img_shape" is not in results;
    otherwise the required key is "img_shape". Added or modified keys are
    "img_shape" and "lazy".
Add or modified keys in "lazy" are "original_shape", "crop_bbox", "flip",
"flip_direction", "interpolation".
Args:
results (dict): A dict stores data pipeline result.
lazy (bool): Determine whether to apply lazy operation. Default: False.
"""
if 'img_shape' not in results:
results['img_shape'] = results['imgs'][0].shape[:2]
if lazy:
if 'lazy' not in results:
img_h, img_w = results['img_shape']
lazyop = dict()
lazyop['original_shape'] = results['img_shape']
lazyop['crop_bbox'] = np.array([0, 0, img_w, img_h],
dtype=np.float32)
lazyop['flip'] = False
lazyop['flip_direction'] = None
lazyop['interpolation'] = None
results['lazy'] = lazyop
else:
assert 'lazy' not in results, 'Use Fuse after lazy operations'
def _scale_size(size, scale):
"""Rescale a size by a ratio.
Args:
size (tuple[int]): (w, h).
scale (float): Scaling factor.
Returns:
tuple[int]: scaled size.
"""
w, h = size
return int(w * float(scale) + 0.5), int(h * float(scale) + 0.5)
def rescale_size(old_size, scale, return_scale=False):
"""Calculate the new size to be rescaled to.
Args:
old_size (tuple[int]): The old size (w, h) of image.
scale (float | tuple[int]): The scaling factor or maximum size.
If it is a float number, then the image will be rescaled by this
factor, else if it is a tuple of 2 integers, then the image will
be rescaled as large as possible within the scale.
return_scale (bool): Whether to return the scaling factor besides the
rescaled image size.
Returns:
tuple[int]: The new rescaled image size.
"""
w, h = old_size
if isinstance(scale, (float, int)):
if scale <= 0:
raise ValueError(f'Invalid scale {scale}, must be positive.')
scale_factor = scale
elif isinstance(scale, tuple):
max_long_edge = max(scale)
max_short_edge = min(scale)
scale_factor = min(max_long_edge / max(h, w),
max_short_edge / min(h, w))
else:
raise TypeError(
f'Scale must be a number or tuple of int, but got {type(scale)}')
new_size = _scale_size((w, h), scale_factor)
if return_scale:
return new_size, scale_factor
else:
return new_size
def imresize(img,
size,
return_scale=False,
interpolation='bilinear',
out=None,
backend=None):
"""Resize image to a given size. """
h, w = img.shape[:2]
if backend is None:
backend = 'cv2'
if backend not in ['cv2', 'pillow']:
raise ValueError(f'backend: {backend} is not supported for resize.'
f"Supported backends are 'cv2', 'pillow'")
if backend == 'pillow':
assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'
pil_image = Image.fromarray(img)
pil_image = pil_image.resize(size, pillow_interp_codes[interpolation])
resized_img = np.array(pil_image)
else:
resized_img = cv2.resize(
img, size, dst=out, interpolation=cv2_interp_codes[interpolation])
if not return_scale:
return resized_img
else:
w_scale = size[0] / w
h_scale = size[1] / h
return resized_img, w_scale, h_scale
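
# Usage sketch for rescale_size(): a float scales both edges, while a tuple
# is treated as (max_long_edge, max_short_edge) bounds. For a 1280x720 image
# and scale (np.inf, 256), the short edge becomes 256 with the ratio kept.
def _example_rescale_size():
    assert rescale_size((1280, 720), 0.5) == (640, 360)
    assert rescale_size((1280, 720), (np.inf, 256)) == (455, 256)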
@PIPELINES.register()
class EntityBoxRescale:
"""Rescale the entity box and proposals according to the image shape.
Required keys are "proposals", "gt_bboxes", added or modified keys are
"gt_bboxes". If original "proposals" is not None, "proposals" and
will be added or modified.
Args:
        scale_factor (np.ndarray): The scale factor used for entity_box rescaling.
"""
def __init__(self, scale_factor):
self.scale_factor = scale_factor
def __call__(self, results):
scale_factor = np.concatenate([self.scale_factor, self.scale_factor])
if 'gt_bboxes' in results:
gt_bboxes = results['gt_bboxes']
results['gt_bboxes'] = gt_bboxes * scale_factor
if 'proposals' in results:
proposals = results['proposals']
if proposals is not None:
assert proposals.shape[1] == 4, (
'proposals shape should be in '
f'(n, 4), but got {proposals.shape}')
results['proposals'] = proposals * scale_factor
return results
def __repr__(self):
return f'{self.__class__.__name__}(scale_factor={self.scale_factor})'
@PIPELINES.register()
class EntityBoxCrop:
"""Crop the entity boxes and proposals according to the cropped images.
Required keys are "proposals", "gt_bboxes", added or modified keys are
"gt_bboxes". If original "proposals" is not None, "proposals" will be
modified.
Args:
crop_bbox(np.ndarray | None): The bbox used to crop the original image.
"""
def __init__(self, crop_bbox):
self.crop_bbox = crop_bbox
def __call__(self, results):
proposals = results['proposals']
gt_bboxes = results['gt_bboxes']
if self.crop_bbox is None:
return results
x1, y1, x2, y2 = self.crop_bbox
img_w, img_h = x2 - x1, y2 - y1
assert gt_bboxes.shape[-1] == 4
gt_bboxes_ = gt_bboxes.copy()
gt_bboxes_[..., 0::2] = np.clip(gt_bboxes[..., 0::2] - x1, 0, img_w - 1)
gt_bboxes_[..., 1::2] = np.clip(gt_bboxes[..., 1::2] - y1, 0, img_h - 1)
results['gt_bboxes'] = gt_bboxes_
if proposals is not None:
assert proposals.shape[-1] == 4
proposals_ = proposals.copy()
proposals_[..., 0::2] = np.clip(proposals[..., 0::2] - x1, 0,
img_w - 1)
proposals_[..., 1::2] = np.clip(proposals[..., 1::2] - y1, 0,
img_h - 1)
results['proposals'] = proposals_
return results
def __repr__(self):
return f'{self.__class__.__name__}(crop_bbox={self.crop_bbox})'
@PIPELINES.register()
class EntityBoxFlip:
"""Flip the entity boxes and proposals with a probability.
Reverse the order of elements in the given bounding boxes and proposals
with a specific direction. The shape of them are preserved, but the
elements are reordered. Only the horizontal flip is supported (seems
vertical flipping makes no sense). Required keys are "proposals",
"gt_bboxes", added or modified keys are "gt_bboxes". If "proposals"
is not None, it will also be modified.
Args:
img_shape (tuple[int]): The img shape.
"""
def __init__(self, img_shape):
self.img_shape = img_shape
def __call__(self, results):
proposals = results['proposals']
gt_bboxes = results['gt_bboxes']
img_h, img_w = self.img_shape
assert gt_bboxes.shape[-1] == 4
gt_bboxes_ = gt_bboxes.copy()
gt_bboxes_[..., 0::4] = img_w - gt_bboxes[..., 2::4] - 1
gt_bboxes_[..., 2::4] = img_w - gt_bboxes[..., 0::4] - 1
if proposals is not None:
assert proposals.shape[-1] == 4
proposals_ = proposals.copy()
proposals_[..., 0::4] = img_w - proposals[..., 2::4] - 1
proposals_[..., 2::4] = img_w - proposals[..., 0::4] - 1
else:
proposals_ = None
results['proposals'] = proposals_
results['gt_bboxes'] = gt_bboxes_
return results
def __repr__(self):
repr_str = f'{self.__class__.__name__}(img_shape={self.img_shape})'
return repr_str
@PIPELINES.register()
class Resize:
"""Resize images to a specific size.
Required keys are "imgs", "img_shape", "modality", added or modified
keys are "imgs", "img_shape", "keep_ratio", "scale_factor", "lazy",
"resize_size". Required keys in "lazy" is None, added or modified key is
"interpolation".
Args:
scale (float | Tuple[int]): If keep_ratio is True, it serves as scaling
factor or maximum size:
If it is a float number, the image will be rescaled by this
factor, else if it is a tuple of 2 integers, the image will
be rescaled as large as possible within the scale.
Otherwise, it serves as (w, h) of output size.
keep_ratio (bool): If set to True, Images will be resized without
changing the aspect ratio. Otherwise, it will resize images to a
given size. Default: True.
interpolation (str): Algorithm used for interpolation:
"nearest" | "bilinear". Default: "bilinear".
lazy (bool): Determine whether to apply lazy operation. Default: False.
"""
def __init__(self,
scale,
keep_ratio=True,
interpolation='bilinear',
lazy=False):
if isinstance(scale, str):
scale = eval(scale)
if isinstance(scale, float):
if scale <= 0:
raise ValueError(f'Invalid scale {scale}, must be positive.')
elif isinstance(scale, tuple):
max_long_edge = max(scale)
max_short_edge = min(scale)
if max_short_edge == -1:
# assign np.inf to long edge for rescaling short edge later.
scale = (np.inf, max_long_edge)
else:
raise TypeError(
f'Scale must be float or tuple of int, but got {type(scale)}')
self.scale = scale
self.keep_ratio = keep_ratio
self.interpolation = interpolation
self.lazy = lazy
def __call__(self, results):
"""Performs the Resize augmentation.
Args:
results (dict): The resulting dict to be modified and passed
to the next transform in pipeline.
"""
_init_lazy_if_proper(results, self.lazy)
if 'scale_factor' not in results:
results['scale_factor'] = np.array([1, 1], dtype=np.float32)
img_h, img_w = results['img_shape']
if self.keep_ratio:
new_w, new_h = rescale_size((img_w, img_h), self.scale)
else:
new_w, new_h = self.scale
self.scale_factor = np.array([new_w / img_w, new_h / img_h],
dtype=np.float32)
results['img_shape'] = (new_h, new_w)
results['keep_ratio'] = self.keep_ratio
results['scale_factor'] = results['scale_factor'] * self.scale_factor
if not self.lazy:
if 'imgs' in results:
results['imgs'] = [
imresize(
img, (new_w, new_h), interpolation=self.interpolation)
for img in results['imgs']
]
if 'keypoint' in results:
results['keypoint'] = results['keypoint'] * self.scale_factor
else:
lazyop = results['lazy']
if lazyop['flip']:
raise NotImplementedError('Put Flip at last for now')
lazyop['interpolation'] = self.interpolation
        if 'gt_bboxes' in results:
            assert not self.lazy
            entity_box_rescale = EntityBoxRescale(self.scale_factor)
            results = entity_box_rescale(results)
return results
def __repr__(self):
repr_str = (f'{self.__class__.__name__}('
f'scale={self.scale}, keep_ratio={self.keep_ratio}, '
f'interpolation={self.interpolation}, '
f'lazy={self.lazy})')
return repr_str
@PIPELINES.register()
class RandomRescale:
"""Randomly resize images so that the short_edge is resized to a specific
size in a given range. The scale ratio is unchanged after resizing.
"""
def __init__(self, scale_range, interpolation='bilinear'):
scale_range = eval(scale_range)
self.scale_range = scale_range
assert len(scale_range) == 2
assert scale_range[0] < scale_range[1]
assert np.all([x > 0 for x in scale_range])
self.keep_ratio = True
self.interpolation = interpolation
def __call__(self, results):
"""Performs the Resize augmentation.
Args:
results (dict): The resulting dict to be modified and passed
to the next transform in pipeline.
"""
short_edge = np.random.randint(self.scale_range[0],
self.scale_range[1] + 1)
resize = Resize((-1, short_edge),
keep_ratio=True,
interpolation=self.interpolation,
lazy=False)
results = resize(results)
results['short_edge'] = short_edge
return results
def __repr__(self):
scale_range = self.scale_range
repr_str = (f'{self.__class__.__name__}('
f'scale_range=({scale_range[0]}, {scale_range[1]}), '
f'interpolation={self.interpolation})')
return repr_str
@PIPELINES.register()
class Rescale:
"""resize images so that the short_edge is resized to a specific
size in a given range. The scale ratio is unchanged after resizing.
Required keys are "imgs", "img_shape", "modality", added or modified
keys are "imgs", "img_shape", "keep_ratio", "scale_factor", "resize_size",
"short_edge".
Args:
scale_range (tuple[int]): The range of short edge length. A closed
interval.
interpolation (str): Algorithm used for interpolation:
"nearest" | "bilinear". Default: "bilinear".
"""
def __init__(self, scale_range, interpolation='bilinear'):
scale_range = eval(scale_range)
self.scale_range = scale_range
self.keep_ratio = True
self.interpolation = interpolation
def __call__(self, results):
"""Performs the Resize augmentation.
Args:
results (dict): The resulting dict to be modified and passed
to the next transform in pipeline.
"""
resize = Resize(
self.scale_range,
keep_ratio=True,
interpolation=self.interpolation,
lazy=False)
results = resize(results)
return results
def __repr__(self):
scale_range = self.scale_range
repr_str = (f'{self.__class__.__name__}('
f'scale_range=({scale_range[0]}, {scale_range[1]}), '
f'interpolation={self.interpolation})')
return repr_str
@PIPELINES.register()
class RandomCrop_v2:
"""Vanilla square random crop that specifics the output size.
Required keys in results are "imgs" and "img_shape", added or
modified keys are "imgs", "lazy"; Required keys in "lazy" are "flip",
"crop_bbox", added or modified key is "crop_bbox".
Args:
size (int): The output size of the images.
lazy (bool): Determine whether to apply lazy operation. Default: False.
"""
def __init__(self, size, lazy=False):
if not isinstance(size, int):
raise TypeError(f'Size must be an int, but got {type(size)}')
self.size = size
self.lazy = lazy
def __call__(self, results):
"""Performs the RandomCrop augmentation.
Args:
results (dict): The resulting dict to be modified and passed
to the next transform in pipeline.
"""
_init_lazy_if_proper(results, self.lazy)
img_h, img_w = results['img_shape']
assert self.size <= img_h and self.size <= img_w
y_offset = 0
x_offset = 0
if img_h > self.size:
y_offset = int(np.random.randint(0, img_h - self.size))
if img_w > self.size:
x_offset = int(np.random.randint(0, img_w - self.size))
if 'crop_quadruple' not in results:
results['crop_quadruple'] = np.array(
[0, 0, 1, 1], # x, y, w, h
dtype=np.float32)
x_ratio, y_ratio = x_offset / img_w, y_offset / img_h
w_ratio, h_ratio = self.size / img_w, self.size / img_h
old_crop_quadruple = results['crop_quadruple']
old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1]
old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3]
        new_crop_quadruple = [
            old_x_ratio + x_ratio * old_w_ratio,
            old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio,
            h_ratio * old_h_ratio
        ]
results['crop_quadruple'] = np.array(
new_crop_quadruple, dtype=np.float32)
new_h, new_w = self.size, self.size
results['crop_bbox'] = np.array(
[x_offset, y_offset, x_offset + new_w, y_offset + new_h])
results['img_shape'] = (new_h, new_w)
if not self.lazy:
results['imgs'] = [
img[y_offset:y_offset + new_h, x_offset:x_offset + new_w]
for img in results['imgs']
]
else:
lazyop = results['lazy']
if lazyop['flip']:
raise NotImplementedError('Put Flip at last for now')
# record crop_bbox in lazyop dict to ensure only crop once in Fuse
lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox']
left = x_offset * (lazy_right - lazy_left) / img_w
right = (x_offset + new_w) * (lazy_right - lazy_left) / img_w
top = y_offset * (lazy_bottom - lazy_top) / img_h
bottom = (y_offset + new_h) * (lazy_bottom - lazy_top) / img_h
lazyop['crop_bbox'] = np.array(
[(lazy_left + left), (lazy_top + top), (lazy_left + right),
(lazy_top + bottom)],
dtype=np.float32)
# Process entity boxes
if 'gt_bboxes' in results:
assert not self.lazy
entity_box_crop = EntityBoxCrop(results['crop_bbox'])
results = entity_box_crop(results)
return results
def __repr__(self):
repr_str = (f'{self.__class__.__name__}(size={self.size}, '
f'lazy={self.lazy})')
return repr_str
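
# Worked example of the crop_quadruple update above, with all ratios taken
# relative to the original image: cropping a 224x224 patch at offset
# (32, 16) from a 256x256 image that was not cropped before gives
# [0.125, 0.0625, 0.875, 0.875].
def _example_crop_quadruple(img_w=256, img_h=256, size=224,
                            x_offset=32, y_offset=16):
    x_ratio, y_ratio = x_offset / img_w, y_offset / img_h
    w_ratio, h_ratio = size / img_w, size / img_h
    return [x_ratio, y_ratio, w_ratio, h_ratio]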
def imflip_(img, direction='horizontal'):
"""Inplace flip an image horizontally or vertically.
Args:
img (ndarray): Image to be flipped.
direction (str): The flip direction, either "horizontal" or
"vertical" or "diagonal".
Returns:
ndarray: The flipped image (inplace).
"""
assert direction in ['horizontal', 'vertical', 'diagonal']
if direction == 'horizontal':
return cv2.flip(img, 1, img)
elif direction == 'vertical':
return cv2.flip(img, 0, img)
else:
return cv2.flip(img, -1, img)
def iminvert(img):
"""Invert (negate) an image.
Args:
img (ndarray): Image to be inverted.
Returns:
ndarray: The inverted image.
"""
return np.full_like(img, 255) - img
@PIPELINES.register()
class Flip:
"""Flip the input images with a probability.
Reverse the order of elements in the given imgs with a specific direction.
The shape of the imgs is preserved, but the elements are reordered.
Required keys are "imgs", "img_shape", "modality", added or modified
keys are "imgs", "lazy" and "flip_direction". Required keys in "lazy" is
None, added or modified key are "flip" and "flip_direction". The Flip
augmentation should be placed after any cropping / reshaping augmentations,
to make sure crop_quadruple is calculated properly.
Args:
flip_ratio (float): Probability of implementing flip. Default: 0.5.
direction (str): Flip imgs horizontally or vertically. Options are
"horizontal" | "vertical". Default: "horizontal".
lazy (bool): Determine whether to apply lazy operation. Default: False.
"""
_directions = ['horizontal', 'vertical']
def __init__(self, flip_ratio=0.5, direction='horizontal', lazy=False):
if direction not in self._directions:
raise ValueError(f'Direction {direction} is not supported. '
f'Currently support ones are {self._directions}')
self.flip_ratio = flip_ratio
self.direction = direction
self.lazy = lazy
def __call__(self, results):
"""Performs the Flip augmentation.
Args:
results (dict): The resulting dict to be modified and passed
to the next transform in pipeline.
"""
_init_lazy_if_proper(results, self.lazy)
flip = np.random.rand() < self.flip_ratio
results['flip'] = flip
results['flip_direction'] = self.direction
if not self.lazy:
if flip:
                for img in results['imgs']:
                    imflip_(img, self.direction)
else:
results['imgs'] = list(results['imgs'])
else:
lazyop = results['lazy']
if lazyop['flip']:
raise NotImplementedError('Use one Flip please')
lazyop['flip'] = flip
lazyop['flip_direction'] = self.direction
if 'gt_bboxes' in results and flip:
assert not self.lazy and self.direction == 'horizontal'
entity_box_flip = EntityBoxFlip(results['img_shape'])
results = entity_box_flip(results)
return results
def __repr__(self):
repr_str = (
f'{self.__class__.__name__}('
f'flip_ratio={self.flip_ratio}, direction={self.direction}, '
f'lazy={self.lazy})')
return repr_str
def imnormalize_(img, mean, std, to_rgb=True):
"""Inplace normalize an image with mean and std.
Args:
img (ndarray): Image to be normalized.
mean (ndarray): The mean to be used for normalize.
std (ndarray): The std to be used for normalize.
to_rgb (bool): Whether to convert to rgb.
Returns:
ndarray: The normalized image.
"""
# cv2 inplace normalization does not accept uint8
assert img.dtype != np.uint8
mean = np.float64(mean.reshape(1, -1))
stdinv = 1 / np.float64(std.reshape(1, -1))
if to_rgb:
cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace
cv2.subtract(img, mean, img) # inplace
cv2.multiply(img, stdinv, img) # inplace
return img
@PIPELINES.register()
class Normalize:
"""Normalize images with the given mean and std value.
Required keys are "imgs", "img_shape", "modality", added or modified
keys are "imgs" and "img_norm_cfg". If modality is 'Flow', additional
keys "scale_factor" is required
Args:
mean (Sequence[float]): Mean values of different channels.
std (Sequence[float]): Std values of different channels.
to_bgr (bool): Whether to convert channels from RGB to BGR.
Default: False.
adjust_magnitude (bool): Indicate whether to adjust the flow magnitude
on 'scale_factor' when modality is 'Flow'. Default: False.
"""
def __init__(self, mean, std, to_bgr=False, adjust_magnitude=False):
        if not isinstance(mean, Sequence):
            raise TypeError(
                f'Mean must be a list or tuple of floats, but got {type(mean)}')
        if not isinstance(std, Sequence):
            raise TypeError(
                f'Std must be a list or tuple of floats, but got {type(std)}')
self.mean = np.array(mean, dtype=np.float32)
self.std = np.array(std, dtype=np.float32)
self.to_bgr = to_bgr
self.adjust_magnitude = adjust_magnitude
def __call__(self, results):
n = len(results['imgs'])
h, w, c = results['imgs'][0].shape
imgs = np.empty((n, h, w, c), dtype=np.float32)
for i, img in enumerate(results['imgs']):
imgs[i] = img
for img in imgs:
imnormalize_(img, self.mean, self.std, self.to_bgr)
results['imgs'] = imgs
results['img_norm_cfg'] = dict(
mean=self.mean, std=self.std, to_bgr=self.to_bgr)
return results
def __repr__(self):
repr_str = (f'{self.__class__.__name__}('
f'mean={self.mean}, '
f'std={self.std}, '
f'to_bgr={self.to_bgr}, '
f'adjust_magnitude={self.adjust_magnitude})')
return repr_str
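# Worked example (illustrative): with the common ImageNet statistics scaled to
# the [0, 255] range, mean = [123.675, 116.28, 103.53] and std = [58.395,
# 57.12, 57.375], a pure-white float32 pixel maps channel 0 to
# (255 - 123.675) / 58.395 ~= 2.249 after imnormalize_'s in-place
# subtract / multiply above.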

@ -0,0 +1,76 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections.abc import Sequence
from ..registry import PIPELINES
import traceback
from ...utils import build
from ...utils import get_logger
@PIPELINES.register()
class Compose(object):
"""
    Composes several pipelines (including decode, sample and transform
    functions) together.
    Note: To deal with ``list``-type cfg temporarily, like:
transform:
- Crop: # A list
attribute: 10
- Resize: # A list
attribute: 20
    every key of the list is passed as the module name when building.
XXX: will be improved in the future.
Args:
pipelines (list): List of transforms to compose.
Returns:
        A compose object which is callable; __call__ for this Compose
        object will call each given :attr:`transforms` sequentially.
"""
def __init__(self, pipelines):
#assert isinstance(pipelines, Sequence)
self.pipelines = []
for p in pipelines.values():
if isinstance(p, dict):
p = build(p, PIPELINES)
self.pipelines.append(p)
elif isinstance(p, list):
for t in p:
#XXX: to deal with old format cfg, ugly code here!
temp_dict = dict(name=list(t.keys())[0])
for all_sub_t in t.values():
if all_sub_t is not None:
temp_dict.update(all_sub_t)
t = build(temp_dict, PIPELINES)
self.pipelines.append(t)
elif callable(p):
self.pipelines.append(p)
else:
                raise TypeError(f'pipelines must be callable or a dict, '
                                f'but got {type(p)}')
def __call__(self, data):
for p in self.pipelines:
try:
data = p(data)
except Exception as e:
stack_info = traceback.format_exc()
logger = get_logger("paddlevideo")
                logger.info("Failed to perform transform [{}] with error: "
                            "{} and stack:\n{}".format(p, e, str(stack_info)))
raise e
return data
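# Minimal sketch (illustrative, the helper name is made up): plain callables
# exercise the `callable(p)` branch of Compose.__init__ above, without needing
# any registered PIPELINES modules.
def _demo_compose():
    add_flag = lambda d: {**d, 'decoded': True}
    count_keys = lambda d: {**d, 'n_keys': len(d)}
    pipe = Compose({'step1': add_flag, 'step2': count_keys})
    return pipe({'filename': 'a.mp4'})  # -> adds 'decoded' and 'n_keys'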

@ -0,0 +1,348 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
try:
import av
except ImportError as e:
print(
f"Warning! {e}, [av] package and it's dependencies is required for TimeSformer and other models."
)
import cv2
import pickle
import decord as de
import math
import random
from ..registry import PIPELINES
def get_start_end_idx(video_size, clip_size, clip_idx, num_clips):
delta = max(video_size - clip_size, 0)
if clip_idx == -1: # here
# Random temporal sampling.
start_idx = random.uniform(0, delta)
else: # ignore
# Uniformly sample the clip with the given index.
start_idx = delta * clip_idx / num_clips
end_idx = start_idx + clip_size - 1
return start_idx, end_idx
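# Worked example: for a 300-frame video and a 64-frame clip, delta = 236;
# with clip_idx == -1 the start is drawn uniformly from [0, 236], while
# clip_idx == 2 of num_clips == 10 gives start_idx = 236 * 2 / 10 = 47.2 and
# end_idx = 47.2 + 64 - 1 = 110.2.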
@PIPELINES.register()
class VideoDecoder(object):
"""
Decode mp4 file to frames.
Args:
filepath: the file path of mp4 file
"""
def __init__(self,
backend='cv2',
mode='train',
sampling_rate=32,
num_seg=8,
num_clips=1,
target_fps=30):
self.backend = backend
# params below only for TimeSformer
self.mode = mode
self.sampling_rate = sampling_rate
self.num_seg = num_seg
self.num_clips = num_clips
self.target_fps = target_fps
def __call__(self, results):
"""
Perform mp4 decode operations.
return:
            List where each item is a numpy array after decoding.
"""
file_path = results['filename']
results['format'] = 'video'
results['backend'] = self.backend
if self.backend == 'cv2':
cap = cv2.VideoCapture(file_path)
videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
sampledFrames = []
for i in range(videolen):
ret, frame = cap.read()
# maybe first frame is empty
                if not ret:
continue
img = frame[:, :, ::-1]
sampledFrames.append(img)
results['frames'] = sampledFrames
results['frames_len'] = len(sampledFrames)
elif self.backend == 'decord':
container = de.VideoReader(file_path)
frames_len = len(container)
results['frames'] = container
results['frames_len'] = frames_len
elif self.backend == 'pyav': # for TimeSformer
if self.mode in ["train", "valid"]:
clip_idx = -1
elif self.mode in ["test"]:
clip_idx = 0
else:
raise NotImplementedError
container = av.open(file_path)
num_clips = 1 # always be 1
# decode process
fps = float(container.streams.video[0].average_rate)
frames_length = container.streams.video[0].frames
duration = container.streams.video[0].duration
if duration is None:
# If failed to fetch the decoding information, decode the entire video.
decode_all_video = True
video_start_pts, video_end_pts = 0, math.inf
else:
decode_all_video = False
start_idx, end_idx = get_start_end_idx(
frames_length,
self.sampling_rate * self.num_seg / self.target_fps * fps,
clip_idx, num_clips)
timebase = duration / frames_length
video_start_pts = int(start_idx * timebase)
video_end_pts = int(end_idx * timebase)
frames = None
# If video stream was found, fetch video frames from the video.
if container.streams.video:
margin = 1024
seek_offset = max(video_start_pts - margin, 0)
container.seek(seek_offset,
any_frame=False,
backward=True,
stream=container.streams.video[0])
tmp_frames = {}
buffer_count = 0
max_pts = 0
for frame in container.decode(**{"video": 0}):
max_pts = max(max_pts, frame.pts)
if frame.pts < video_start_pts:
continue
if frame.pts <= video_end_pts:
tmp_frames[frame.pts] = frame
else:
buffer_count += 1
tmp_frames[frame.pts] = frame
if buffer_count >= 0:
break
video_frames = [tmp_frames[pts] for pts in sorted(tmp_frames)]
container.close()
frames = [frame.to_rgb().to_ndarray() for frame in video_frames]
clip_sz = self.sampling_rate * self.num_seg / self.target_fps * fps
start_idx, end_idx = get_start_end_idx(
len(frames), # frame_len
clip_sz,
clip_idx if decode_all_video else
                0,  # if decoding the whole video: -1 in train/valid, 0 in test;
                # otherwise always 0, since clip_size frames were already selected during decoding.
1)
results['frames'] = frames
results['frames_len'] = len(frames)
results['start_idx'] = start_idx
results['end_idx'] = end_idx
else:
raise NotImplementedError
return results
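# Usage sketch (illustrative, the path is made up): with the default cv2
# backend only 'filename' is needed; 'frames' and 'frames_len' are filled in
# for the sampler that follows in the pipeline.
#
#   results = VideoDecoder(backend='cv2')({'filename': 'data/example.mp4'})
#   assert results['frames_len'] == len(results['frames'])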
@PIPELINES.register()
class FrameDecoder(object):
"""just parse results
"""
def __init__(self):
pass
def __call__(self, results):
results['format'] = 'frame'
return results
@PIPELINES.register()
class MRIDecoder(object):
"""just parse results
"""
def __init__(self):
pass
def __call__(self, results):
results['format'] = 'MRI'
return results
@PIPELINES.register()
class FeatureDecoder(object):
"""
    Perform feature decode operations, e.g. for YouTube-8M features.
"""
def __init__(self, num_classes, max_len=512, has_label=True):
self.max_len = max_len
self.num_classes = num_classes
self.has_label = has_label
def __call__(self, results):
"""
Perform feature decode operations.
return:
            List where each item is a numpy array after decoding.
"""
#1. load pkl
#2. parse to rgb/audio/
#3. padding
filepath = results['filename']
        with open(filepath, 'rb') as f:
            data = pickle.load(f, encoding='bytes')
record = data
nframes = record['nframes'] if 'nframes' in record else record[
b'nframes']
rgb = record['feature'].astype(
float) if 'feature' in record else record[b'feature'].astype(float)
audio = record['audio'].astype(
float) if 'audio' in record else record[b'audio'].astype(float)
if self.has_label:
label = record['label'] if 'label' in record else record[b'label']
one_hot_label = self.make_one_hot(label, self.num_classes)
rgb = rgb[0:nframes, :]
audio = audio[0:nframes, :]
rgb = self.dequantize(rgb,
max_quantized_value=2.,
min_quantized_value=-2.)
audio = self.dequantize(audio,
max_quantized_value=2,
min_quantized_value=-2)
if self.has_label:
results['labels'] = one_hot_label.astype("float32")
feat_pad_list = []
feat_len_list = []
mask_list = []
vitem = [rgb, audio]
for vi in range(2): #rgb and audio
if vi == 0:
prefix = "rgb_"
else:
prefix = "audio_"
feat = vitem[vi]
results[prefix + 'len'] = feat.shape[0]
#feat pad step 1. padding
feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]),
dtype=np.float32)
feat_pad = np.concatenate((feat, feat_add), axis=0)
results[prefix + 'data'] = feat_pad.astype("float32")
#feat pad step 2. mask
feat_mask_origin = np.ones(feat.shape, dtype=np.float32)
feat_mask_add = feat_add
feat_mask = np.concatenate((feat_mask_origin, feat_mask_add),
axis=0)
results[prefix + 'mask'] = feat_mask.astype("float32")
return results
def dequantize(self,
feat_vector,
max_quantized_value=2.,
min_quantized_value=-2.):
"""
Dequantize the feature from the byte format to the float format
"""
assert max_quantized_value > min_quantized_value
quantized_range = max_quantized_value - min_quantized_value
scalar = quantized_range / 255.0
bias = (quantized_range / 512.0) + min_quantized_value
return feat_vector * scalar + bias
def make_one_hot(self, label, dim=3862):
one_hot_label = np.zeros(dim)
one_hot_label = one_hot_label.astype(float)
for ind in label:
one_hot_label[int(ind)] = 1
return one_hot_label
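# Worked example for dequantize above: with the default range [-2, 2] the
# scalar is 4 / 255 and the bias is 4 / 512 - 2, so a stored byte of 255 maps
# to 255 * (4 / 255) + (4 / 512 - 2) = 2.0078125, i.e. just above
# max_quantized_value, recovering the center of the top quantization bin.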
@PIPELINES.register()
class ActionFeatureDecoder(object):
"""
Perform feature decode operations on footballaction
"""
def __init__(self, num_classes, max_len=512, has_label=True):
self.max_len = max_len
self.num_classes = num_classes
self.has_label = has_label
def __call__(self, results):
"""
Perform feature decode operations.
return:
            List where each item is a numpy array after decoding.
"""
#1. load pkl
#2. parse to rgb/audio/
#3. padding
filepath = results['filename']
        with open(filepath, 'rb') as f:
            data = pickle.load(f, encoding='bytes')
pkl_data = data
rgb = pkl_data['image_feature'].astype(float)
audio = pkl_data['audio_feature'].astype(float)
label_id_info = pkl_data['label_info']
label_cls = [label_id_info['label']]
label_one = int(label_cls[0])
if len(label_cls) > 1:
label_index = random.randint(0, 1)
label_one = int(label_cls[label_index])
iou_norm = float(label_id_info['norm_iou'])
results['labels'] = np.array([label_one])
results['iou_norm'] = float(iou_norm)
vitem = [rgb, audio]
for vi in range(2): #rgb and audio
if vi == 0:
prefix = "rgb_"
else:
prefix = "audio_"
feat = vitem[vi]
results[prefix + 'len'] = feat.shape[0]
#feat pad step 1. padding
feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]),
dtype=np.float32)
feat_pad = np.concatenate((feat, feat_add), axis=0)
results[prefix + 'data'] = feat_pad.astype("float32")
#feat pad step 2. mask
feat_mask_origin = np.ones(feat.shape, dtype=np.float32)
feat_mask = np.concatenate((feat_mask_origin, feat_add), axis=0)
results[prefix + 'mask'] = feat_mask.astype("float32")
return results

@ -0,0 +1,206 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import PIL.Image as pil
try:
import skimage.transform
except ImportError as e:
print(
f"Warning! {e}, [scikit-image] package and it's dependencies is required for ADDS."
)
from PIL import Image
from ..registry import PIPELINES
@PIPELINES.register()
class ImageDecoder(object):
"""Decode Image
"""
def __init__(self,
dataset,
frame_idxs,
num_scales,
side_map,
full_res_shape,
img_ext,
backend='cv2'):
self.backend = backend
self.dataset = dataset
self.frame_idxs = frame_idxs
self.num_scales = num_scales
self.side_map = side_map
self.full_res_shape = full_res_shape
self.img_ext = img_ext
def _pil_loader(self, path):
with open(path, 'rb') as f:
with Image.open(f) as img:
return img.convert('RGB')
def get_color(self, folder, frame_index, side):
color = self._pil_loader(
self.get_image_path(self.dataset, folder, frame_index, side))
return color
def get_image_path(self, dataset, folder, frame_index, side):
if dataset == "kitti":
f_str = "{:010d}{}".format(frame_index, self.img_ext)
image_path = os.path.join(self.data_path, folder, f_str)
elif dataset == "kitti_odom":
f_str = "{:06d}{}".format(frame_index, self.img_ext)
image_path = os.path.join(self.data_path,
"sequences/{:02d}".format(int(folder)),
"image_{}".format(self.side_map[side]),
f_str)
elif dataset == "kitti_depth":
f_str = "{:010d}{}".format(frame_index, self.img_ext)
image_path = os.path.join(
self.data_path, folder,
"image_0{}/data".format(self.side_map[side]), f_str)
return image_path
def get_depth(self, dataset, folder, frame_index, side):
if dataset == "kitii_depth":
f_str = "{:010d}.png".format(frame_index)
depth_path = os.path.join(
self.data_path, folder,
"proj_depth/groundtruth/image_0{}".format(self.side_map[side]),
f_str)
depth_gt = pil.open(depth_path)
depth_gt = depth_gt.resize(self.full_res_shape, pil.NEAREST)
depth_gt = np.array(depth_gt).astype(np.float32) / 256
else:
f_str = "{:010d}{}".format(frame_index, self.img_ext)
depth_path = os.path.join(self.data_path, folder + '_gt', f_str)
img_file = Image.open(depth_path)
depth_png = np.array(img_file, dtype=int)
img_file.close()
# make sure we have a proper 16bit depth map here.. not 8bit!
assert np.max(depth_png) > 255, \
"np.max(depth_png)={}, path={}".format(np.max(depth_png), depth_path)
            depth_gt = depth_png.astype(np.float64) / 256.
depth_gt = depth_gt[160:960 - 160, :]
depth_gt = skimage.transform.resize(depth_gt,
self.full_res_shape[::-1],
order=0,
preserve_range=True,
mode='constant')
return depth_gt
def __call__(self, results):
"""
        Perform image decode operations.
        return:
            List where each item is a numpy array after decoding.
"""
if results.get('mode', None) == 'infer':
imgs = {}
imgs[("color", 0,
-1)] = Image.open(results["filename"]).convert("RGB")
results['imgs'] = imgs
return results
self.data_path = results['data_path']
results['backend'] = self.backend
imgs = {}
results['frame_idxs'] = self.frame_idxs
results['num_scales'] = self.num_scales
file_name = results['filename']
folder = results['folder']
frame_index = results['frame_index']
line = file_name.split('/')
istrain = folder.split('_')[1]
if 'mode' not in results:
results['mode'] = istrain
results['day_or_night'] = folder.split('_')[0]
if istrain == "train":
if folder[0] == 'd':
folder2 = folder + '_fake_night'
flag = 0
else:
folder2 = folder + '_fake_day'
tmp = folder
folder = folder2
folder2 = tmp
flag = 1
if len(line) == 3:
side = line[2]
else:
side = None
results['side'] = side
for i in self.frame_idxs:
if i == "s":
other_side = {"r": "l", "l": "r"}[side]
imgs[("color", i,
-1)] = self.get_color(folder, frame_index, other_side)
imgs[("color_n", i,
-1)] = self.get_color(folder2, frame_index,
other_side)
else:
imgs[("color", i,
-1)] = self.get_color(folder, frame_index + i, side)
imgs[("color_n", i,
-1)] = self.get_color(folder2, frame_index + i, side)
istrain = folder.split('_')[1]
if istrain != 'train':
                if flag:
                    depth_gt = self.get_depth(self.dataset, folder2,
                                              frame_index, side)
                else:
                    depth_gt = self.get_depth(self.dataset, folder,
                                              frame_index, side)
imgs["depth_gt"] = np.expand_dims(depth_gt, 0)
elif istrain == 'val':
if len(line) == 3:
side = line[2]
else:
side = None
for i in self.frame_idxs:
if i == "s":
other_side = {"r": "l", "l": "r"}[side]
imgs[("color", i,
-1)] = self.get_color(folder, frame_index, other_side)
else:
imgs[("color", i,
-1)] = self.get_color(folder, frame_index + i, side)
# adjusting intrinsics to match each scale in the pyramid
depth_gt = self.get_depth(self.dataset, folder, frame_index, side)
imgs["depth_gt"] = np.expand_dims(depth_gt, 0)
results['imgs'] = imgs
return results

@ -0,0 +1,93 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import numpy as np
from PIL import Image
import decord as de
from ..registry import PIPELINES
@PIPELINES.register()
class DecodeSampler(object):
"""
    We use 'decord' for decoding and sampling, which is faster than OpenCV.
    This is used in the SlowFast model.
Args:
num_frames(int): the number of frames we want to sample.
sampling_rate(int): sampling rate for video data.
target_fps(int): desired fps, default 30
        test_mode(bool): whether in test mode, as opposed to train/valid. In SlowFast, multi-crop is used at test time.
"""
def __init__(self,
num_frames,
sampling_rate,
default_sampling_rate=2,
target_fps=30,
test_mode=False):
self.num_frames = num_frames
self.orig_sampling_rate = self.sampling_rate = sampling_rate
self.default_sampling_rate = default_sampling_rate
self.target_fps = target_fps
self.test_mode = test_mode
def get_start_end_idx(self, video_size, clip_size, clip_idx,
temporal_num_clips):
delta = max(video_size - clip_size, 0)
if not self.test_mode:
# Random temporal sampling.
start_idx = random.uniform(0, delta)
else:
# Uniformly sample the clip with the given index.
start_idx = delta * clip_idx / temporal_num_clips
end_idx = start_idx + clip_size - 1
return start_idx, end_idx
def __call__(self, results):
"""
Perform mp4 decode operations.
return:
            List where each item is a numpy array after decoding.
"""
short_cycle_idx = results.get('short_cycle_idx')
if short_cycle_idx:
self.sampling_rate = random.randint(self.default_sampling_rate,
self.orig_sampling_rate)
filepath = results['filename']
temporal_sample_index = results['temporal_sample_index']
temporal_num_clips = results['temporal_num_clips']
vr = de.VideoReader(filepath)
videolen = len(vr)
        fps = vr.get_avg_fps()
        clip_size = self.num_frames * self.sampling_rate * fps / self.target_fps
start_idx, end_idx = self.get_start_end_idx(videolen, clip_size,
temporal_sample_index,
temporal_num_clips)
index = np.linspace(start_idx, end_idx, self.num_frames).astype("int64")
        index = np.clip(index, 0, videolen - 1)
        frames_select = vr.get_batch(index)
# dearray_to_img
np_frames = frames_select.asnumpy()
frames_select_list = []
for i in range(np_frames.shape[0]):
imgbuf = np_frames[i]
frames_select_list.append(Image.fromarray(imgbuf, mode='RGB'))
results['imgs'] = frames_select_list
return results
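# Worked example: num_frames = 32, sampling_rate = 2, target_fps = 30 and a
# 24-fps video give clip_size = 32 * 2 * 24 / 30 = 51.2 source frames, which
# get_start_end_idx places inside the video before the linspace sampling above.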

@ -0,0 +1,224 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import numpy as np
from PIL import Image
try:
import SimpleITK as sitk
except ImportError as e:
print(
f"Warning! {e}, [SimpleITK] package and it's dependencies is required for PP-Care."
)
import cv2
from ..registry import PIPELINES
@PIPELINES.register()
class SFMRI_DecodeSampler(object):
"""
    Sample frame ids.
    NOTE: Uses PIL to read images here, which differs slightly from CV2.
    Args:
        num_seg(int): number of segments.
        seg_len(int): number of sampled frames in each segment.
        valid_mode(bool): True or False.
        select_left: Whether to select the left-of-middle frame when the sampling interval is even in test mode.
    Returns:
        frames_idx: the indices of the sampled frames.
"""
def __init__(self,
num_seg,
seg_len,
valid_mode=False,
select_left=False,
dense_sample=False,
linspace_sample=False):
self.num_seg = num_seg
self.seg_len = seg_len
self.valid_mode = valid_mode
self.select_left = select_left
self.dense_sample = dense_sample
self.linspace_sample = linspace_sample
def _get(self, frames_idx_s, frames_idx_f, results):
frame_dir = results['frame_dir']
imgs_s = []
imgs_f = []
MRI = sitk.GetArrayFromImage(sitk.ReadImage(frame_dir))
for idx in frames_idx_s:
item = MRI[idx]
item = cv2.resize(item, (224, 224))
imgs_s.append(item)
for idx in frames_idx_f:
item = MRI[idx]
item = cv2.resize(item, (224, 224))
imgs_f.append(item)
results['imgs'] = [imgs_s, imgs_f]
return results
def __call__(self, results):
"""
Args:
frames_len: length of frames.
return:
sampling id.
"""
frames_len = int(results['frames_len'])
average_dur1 = int(frames_len / self.num_seg[0])
average_dur2 = int(frames_len / self.num_seg[1])
frames_idx_s = []
frames_idx_f = []
if self.linspace_sample:
if 'start_idx' in results and 'end_idx' in results:
offsets_s = np.linspace(results['start_idx'],
results['end_idx'], self.num_seg[0])
offsets_f = np.linspace(results['start_idx'],
results['end_idx'], self.num_seg[1])
else:
offsets_s = np.linspace(0, frames_len - 1, self.num_seg[0])
offsets_f = np.linspace(0, frames_len - 1, self.num_seg[1])
offsets_s = np.clip(offsets_s, 0, frames_len - 1).astype(np.int64)
offsets_f = np.clip(offsets_f, 0, frames_len - 1).astype(np.int64)
frames_idx_s = list(offsets_s)
frames_idx_f = list(offsets_f)
return self._get(frames_idx_s, frames_idx_f, results)
if not self.select_left:
if self.dense_sample: # For ppTSM
if not self.valid_mode: # train
sample_pos = max(1, 1 + frames_len - 64)
t_stride1 = 64 // self.num_seg[0]
t_stride2 = 64 // self.num_seg[1]
start_idx = 0 if sample_pos == 1 else np.random.randint(
0, sample_pos - 1)
offsets_s = [(idx * t_stride1 + start_idx) % frames_len + 1
for idx in range(self.num_seg[0])]
offsets_f = [(idx * t_stride2 + start_idx) % frames_len + 1
for idx in range(self.num_seg[1])]
frames_idx_s = offsets_s
frames_idx_f = offsets_f
else:
sample_pos = max(1, 1 + frames_len - 64)
t_stride1 = 64 // self.num_seg[0]
t_stride2 = 64 // self.num_seg[1]
start_list = np.linspace(0,
sample_pos - 1,
num=10,
dtype=int)
offsets_s = []
offsets_f = []
for start_idx in start_list.tolist():
offsets_s += [
(idx * t_stride1 + start_idx) % frames_len + 1
for idx in range(self.num_seg[0])
]
for start_idx in start_list.tolist():
offsets_f += [
(idx * t_stride2 + start_idx) % frames_len + 1
for idx in range(self.num_seg[1])
]
frames_idx_s = offsets_s
frames_idx_f = offsets_f
else:
for i in range(self.num_seg[0]):
idx = 0
if not self.valid_mode:
if average_dur1 >= self.seg_len:
idx = random.randint(0, average_dur1 - self.seg_len)
idx += i * average_dur1
elif average_dur1 >= 1:
idx += i * average_dur1
else:
idx = i
else:
if average_dur1 >= self.seg_len:
idx = (average_dur1 - 1) // 2
idx += i * average_dur1
elif average_dur1 >= 1:
idx += i * average_dur1
else:
idx = i
for jj in range(idx, idx + self.seg_len):
frames_idx_s.append(jj)
for i in range(self.num_seg[1]):
idx = 0
if not self.valid_mode:
if average_dur2 >= self.seg_len:
idx = random.randint(0, average_dur2 - self.seg_len)
idx += i * average_dur2
elif average_dur2 >= 1:
idx += i * average_dur2
else:
idx = i
else:
if average_dur2 >= self.seg_len:
idx = (average_dur2 - 1) // 2
idx += i * average_dur2
elif average_dur2 >= 1:
idx += i * average_dur2
else:
idx = i
for jj in range(idx, idx + self.seg_len):
frames_idx_f.append(jj)
return self._get(frames_idx_s, frames_idx_f, results)
else: # for TSM
if not self.valid_mode:
if average_dur2 > 0:
offsets_s = np.multiply(list(range(
self.num_seg[0])), average_dur1) + np.random.randint(
average_dur1, size=self.num_seg[0])
offsets_f = np.multiply(list(range(
self.num_seg[1])), average_dur2) + np.random.randint(
average_dur2, size=self.num_seg[1])
elif frames_len > self.num_seg[1]:
offsets_s = np.sort(
np.random.randint(frames_len, size=self.num_seg[0]))
offsets_f = np.sort(
np.random.randint(frames_len, size=self.num_seg[1]))
else:
offsets_s = np.zeros(shape=(self.num_seg[0], ))
offsets_f = np.zeros(shape=(self.num_seg[1], ))
else:
if frames_len > self.num_seg[1]:
average_dur_float_s = frames_len / self.num_seg[0]
offsets_s = np.array([
int(average_dur_float_s / 2.0 + average_dur_float_s * x)
for x in range(self.num_seg[0])
])
average_dur_float_f = frames_len / self.num_seg[1]
offsets_f = np.array([
int(average_dur_float_f / 2.0 + average_dur_float_f * x)
for x in range(self.num_seg[1])
])
else:
offsets_s = np.zeros(shape=(self.num_seg[0], ))
offsets_f = np.zeros(shape=(self.num_seg[1], ))
frames_idx_s = list(offsets_s)
frames_idx_f = list(offsets_f)
return self._get(frames_idx_s, frames_idx_f, results)

@ -0,0 +1,116 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from ..registry import PIPELINES
@PIPELINES.register()
class Mixup(object):
"""
Mixup operator.
Args:
alpha(float): alpha value.
"""
def __init__(self, alpha=0.2):
        assert alpha > 0., \
            'parameter alpha[%f] should be > 0.0' % (alpha)
self.alpha = alpha
def __call__(self, batch):
imgs, labels = list(zip(*batch))
imgs = np.array(imgs)
labels = np.array(labels)
bs = len(batch)
idx = np.random.permutation(bs)
lam = np.random.beta(self.alpha, self.alpha)
lams = np.array([lam] * bs, dtype=np.float32)
imgs = lam * imgs + (1 - lam) * imgs[idx]
return list(zip(imgs, labels, labels[idx], lams))
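# Worked example: with lam = 0.7 each mixed image is 0.7 * img_i +
# 0.3 * img_perm(i); the returned (label, label[idx], lam) triplet lets the
# loss be blended the same way, e.g.
# 0.7 * CE(pred, y_i) + 0.3 * CE(pred, y_perm(i)).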
@PIPELINES.register()
class Cutmix(object):
""" Cutmix operator
Args:
alpha(float): alpha value.
"""
def __init__(self, alpha=0.2):
        assert alpha > 0., \
            'parameter alpha[%f] should be > 0.0' % (alpha)
self.alpha = alpha
def rand_bbox(self, size, lam):
""" rand_bbox """
w = size[2]
h = size[3]
cut_rat = np.sqrt(1. - lam)
        cut_w = int(w * cut_rat)
        cut_h = int(h * cut_rat)
# uniform
cx = np.random.randint(w)
cy = np.random.randint(h)
bbx1 = np.clip(cx - cut_w // 2, 0, w)
bby1 = np.clip(cy - cut_h // 2, 0, h)
bbx2 = np.clip(cx + cut_w // 2, 0, w)
bby2 = np.clip(cy + cut_h // 2, 0, h)
return bbx1, bby1, bbx2, bby2
def __call__(self, batch):
imgs, labels = list(zip(*batch))
imgs = np.array(imgs)
labels = np.array(labels)
bs = len(batch)
idx = np.random.permutation(bs)
lam = np.random.beta(self.alpha, self.alpha)
bbx1, bby1, bbx2, bby2 = self.rand_bbox(imgs.shape, lam)
imgs[:, :, bbx1:bbx2, bby1:bby2] = imgs[idx, :, bbx1:bbx2, bby1:bby2]
lam = 1 - (float(bbx2 - bbx1) * (bby2 - bby1) /
(imgs.shape[-2] * imgs.shape[-1]))
lams = np.array([lam] * bs, dtype=np.float32)
return list(zip(imgs, labels, labels[idx], lams))
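# Worked example: for 224x224 inputs and lam = 0.75, cut_rat = sqrt(0.25) = 0.5,
# so the pasted box is 112x112 before clipping; when nothing clips, lam is
# re-derived from the true box area as 1 - (112 * 112) / (224 * 224) = 0.75.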
@PIPELINES.register()
class VideoMix(object):
"""
VideoMix operator.
Args:
cutmix_prob(float): prob choose cutmix
mixup_alpha(float): alpha for mixup aug
cutmix_alpha(float): alpha for cutmix aug
"""
def __init__(self, cutmix_prob=0.5, mixup_alpha=0.2, cutmix_alpha=1.0):
        assert cutmix_prob > 0., \
            'parameter cutmix_prob[%f] should be > 0.0' % (cutmix_prob)
        assert mixup_alpha > 0., \
            'parameter mixup_alpha[%f] should be > 0.0' % (mixup_alpha)
        assert cutmix_alpha > 0., \
            'parameter cutmix_alpha[%f] should be > 0.0' % (cutmix_alpha)
self.cutmix_prob = cutmix_prob
self.mixup = Mixup(mixup_alpha)
self.cutmix = Cutmix(cutmix_alpha)
def __call__(self, batch):
if np.random.random() < self.cutmix_prob:
return self.cutmix(batch)
else:
return self.mixup(batch)

@ -0,0 +1,380 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import numpy as np
from PIL import Image
import copy
import json
from ..registry import PIPELINES
try:
from paddlenlp.transformers import BertTokenizer
except ImportError as e:
print(
f"Warning! {e}, [paddlenlp] package and it's dependencies is required for ActBERT."
)
@PIPELINES.register()
class FeaturePadding(object):
"""
Padding feature to target shape.
"""
def __init__(self, max_region_num=36, max_action_num=5):
self.max_region_num = max_region_num
self.max_action_num = max_action_num
def __call__(self, results):
"""
Padding feature.
"""
pack_feature = results['feature']
tokenizer = results['tokenizer']
image_feature_wp, image_target_wp, image_location_wp, \
num_boxes, image_h, image_w, image_id, caption, \
action_feature_wp, action_target_wp, num_actions = pack_feature
image_feature = np.zeros((self.max_region_num, 2048), dtype=np.float32)
image_target = np.zeros((self.max_region_num, 1601), dtype=np.float32)
image_location = np.zeros((self.max_region_num, 5), dtype=np.float32)
action_feature = np.zeros((self.max_action_num, 2048), dtype=np.float32)
action_target = np.zeros((self.max_action_num, ), dtype=np.int64)
num_boxes = int(num_boxes)
image_feature[:num_boxes] = image_feature_wp
image_target[:num_boxes] = image_target_wp
image_location[:num_boxes, :4] = image_location_wp
image_location[:, 4] = (image_location[:, 3] - image_location[:, 1]) * (
image_location[:, 2] - image_location[:, 0]) / (float(image_w) *
float(image_h))
image_location[:, 0] = image_location[:, 0] / float(image_w)
image_location[:, 1] = image_location[:, 1] / float(image_h)
image_location[:, 2] = image_location[:, 2] / float(image_w)
image_location[:, 3] = image_location[:, 3] / float(image_h)
image_feature = copy.deepcopy(image_feature)
image_target = copy.deepcopy(image_target)
num_actions = int(num_actions)
action_feature[:num_actions] = action_feature_wp
action_target[:num_actions] = action_target_wp
action_feature = copy.deepcopy(action_feature)
action_target = copy.deepcopy(action_target)
results = dict(image_feat=image_feature,
image_target=image_target,
caption=caption,
image_loc=image_location,
num_boxes=int(num_boxes),
action_feat=action_feature,
action_target=action_target,
num_actions=int(num_actions),
tokenizer=tokenizer)
return results
@PIPELINES.register()
class RandomCap(object):
def __init__(self, caption_path):
"""
Random Caption for NSP task
"""
self.caption_path = caption_path
def select_caption(self, caption):
captions = caption.split('!')
rind = random.randint(0, len(captions) - 1)
caption = captions[rind]
return caption
def get_random_caption(self, all_captions):
num_caps = len(all_captions)
rand_doc_idx = random.randint(0, num_caps - 1)
caption = all_captions[rand_doc_idx]
caption = self.select_caption(caption)
return caption
def random_cap(self, caption, all_captions):
if random.random() > 0.5:
label = 0
else:
caption = self.get_random_caption(all_captions)
label = 1
return caption, label
def __call__(self, results):
caption = results['caption']
        with open(self.caption_path, 'r') as f:
            all_captions = list(json.load(f))
caption = self.select_caption(caption)
caption, label = self.random_cap(caption, all_captions)
results['caption'] = caption
results['is_next'] = label
return results
@PIPELINES.register()
class Tokenize(object):
def __init__(self, ):
"""
Tokenize caption
"""
pass
def __call__(self, results):
caption = results['caption']
tokenizer = results['tokenizer']
tokens_caption = tokenizer.tokenize(caption)
results['caption'] = tokens_caption
return results
@PIPELINES.register()
class RandomMask(object):
def __init__(self,
max_seq_length=36,
max_action_length=5,
max_region_length=36):
self.max_seq_length = max_seq_length
self.max_action_length = max_action_length
self.max_region_length = max_region_length
def get_image_global_feature(self, image_feat, image_loc, image_mask):
g_image_feat = np.sum(image_feat, axis=0) / np.sum(
image_mask, axis=0, keepdims=True)
image_feat = np.concatenate(
[np.expand_dims(g_image_feat, axis=0), image_feat],
axis=0).astype("float32")
g_image_loc = np.array([0, 0, 1, 1, 1]).astype("float32")
image_loc = np.concatenate(
[np.expand_dims(g_image_loc, axis=0), image_loc], axis=0)
g_image_mask = np.array([1])
image_mask = np.concatenate([g_image_mask, image_mask], axis=0)
return image_feat, image_loc, image_mask
def _truncate_seq_pair(self, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length.
This is a simple heuristic which will always truncate the longer sequence
one token at a time. This makes more sense than truncating an equal percent
of tokens from each, since if one sequence is very short then each token
that's truncated likely contains more information than a longer sequence.
"""
while True:
total_length = len(tokens_b)
if total_length <= max_length:
break
tokens_b.pop()
def random_word(self, tokens, tokenizer):
"""
Masking some random tokens for Language Model task with probabilities as in the original BERT paper.
Args:
tokens: list of str, tokenized sentence.
            tokenizer: Tokenizer, object used for tokenization (we need its vocab here)
Return:
(list of str, list of int), masked tokens and related labels for LM prediction
"""
output_label = []
for i, token in enumerate(tokens):
prob = random.random()
# mask token with 15% probability
if prob < 0.15:
prob /= 0.15
# 80% randomly change token to mask token
if prob < 0.8:
tokens[i] = "[MASK]"
# 10% randomly change token to random token
elif prob < 0.9:
#tok = random.choice(list(tokenizer.vocab.items()))[0]
                    tok = tokenizer.vocab.idx_to_token[random.randint(
                        0, tokenizer.vocab_size - 1)]
tokens[i] = tok
# rest 10% randomly keep current token
# append current token to output (we will predict these later)
try:
output_label.append(tokenizer.vocab[token])
except KeyError:
# For unknown words (should not occur with BPE vocab)
output_label.append(tokenizer.vocab["[UNK]"])
                    print(
                        "Cannot find token '{}' in vocab. Using [UNK] instead."
                        .format(token))
else:
# no masking token (will be ignored by loss function later)
output_label.append(-1)
return tokens, output_label
def random_region(self, image_feat, image_loc, num_boxes):
output_label = []
for i in range(num_boxes):
prob = random.random()
# mask token with 15% probability
if prob < 0.15:
prob /= 0.15
                # 90% of selected regions have their feature zeroed out
                if prob < 0.9:
                    image_feat[i] = 0
                # the remaining 10% keep the current feature
# append current token to output (we will predict these later)
output_label.append(1)
else:
# no masking token (will be ignored by loss function later)
output_label.append(-1)
return image_feat, image_loc, output_label
def random_action(self, action_feat, action_target, num_actions):
output_label = []
for i in range(num_actions):
prob = random.random()
# mask token with 15% probability
if prob < 0.15:
prob /= 0.15
# 90% randomly change token to mask token
if prob < 0.9:
action_feat[i] = 0
# rest 10% randomly keep current token
# append current token to output (we will predict these later)
output_label.append(action_target[i])
else:
# no masking token (will be ignored by loss function later)
output_label.append(-1)
return action_feat, output_label
def __call__(self, results):
caption = results['caption']
tokenizer = results['tokenizer']
image_feat = results['image_feat']
image_loc = results['image_loc']
num_boxes = results['num_boxes']
action_feat = results['action_feat']
action_target = results['action_target']
num_actions = results['num_actions']
is_next = results['is_next']
image_target = results['image_target']
self._truncate_seq_pair(caption, self.max_seq_length - 2)
caption, caption_label = self.random_word(caption, tokenizer)
image_feat, image_loc, image_label = self.random_region(
image_feat, image_loc, num_boxes)
action_feat, action_label = self.random_action(action_feat,
action_target,
num_actions)
# concatenate lm labels and account for CLS, SEP, SEP
lm_label_ids = [-1] + caption_label + [-1]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in caption:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
input_mask = [1] * (len(input_ids))
image_mask = [1] * (num_boxes)
action_mask = [1] * (num_actions)
# Zero-pad up to the visual sequence length.
while len(image_mask) < self.max_region_length:
image_mask.append(0)
image_label.append(-1)
while len(action_mask) < self.max_action_length:
action_mask.append(0)
action_label.append(-1)
# Zero-pad up to the sequence length.
while len(input_ids) < self.max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
lm_label_ids.append(-1)
assert len(input_ids) == self.max_seq_length
assert len(input_mask) == self.max_seq_length
assert len(segment_ids) == self.max_seq_length
assert len(lm_label_ids) == self.max_seq_length
assert len(image_mask) == self.max_region_length
assert len(image_label) == self.max_region_length
assert len(action_mask) == self.max_action_length
assert len(action_label) == self.max_action_length
image_feat, image_loc, image_mask = self.get_image_global_feature(
image_feat, image_loc, np.array(image_mask))
features = [
np.array(input_ids),
action_feat,
image_feat,
image_loc,
np.array(segment_ids),
np.array(input_mask),
image_mask,
np.array(action_mask),
np.array(lm_label_ids),
np.array(action_label),
np.array(is_next),
np.array(image_label),
image_target,
]
results['features'] = features
return results
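# Net masking distribution implemented by random_word above: each token is
# selected with p = 0.15; a selected token becomes [MASK] with p = 0.8, a
# random vocab token with p = 0.1 and is kept unchanged with p = 0.1, i.e.
# 12% / 1.5% / 1.5% of all tokens, matching the original BERT recipe.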

@ -0,0 +1,382 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import numpy as np
from PIL import Image
try:
import SimpleITK as sitk
except ImportError as e:
print(
f"Warning! {e}, [SimpleITK] package and it's dependencies is required for PP-Care."
)
import cv2
from ..registry import PIPELINES
try:
import cPickle as pickle
from cStringIO import StringIO
except ImportError:
import pickle
from io import BytesIO
@PIPELINES.register()
class Sampler(object):
"""
    Sample frame ids.
    NOTE: Uses PIL to read images here, which differs slightly from CV2.
    Args:
        num_seg(int): number of segments.
        seg_len(int): number of sampled frames in each segment.
        valid_mode(bool): True or False.
        select_left: Whether to select the left-of-middle frame when the sampling interval is even in test mode.
    Returns:
        frames_idx: the indices of the sampled frames.
"""
def __init__(self,
num_seg,
seg_len,
frame_interval=None,
valid_mode=False,
select_left=False,
dense_sample=False,
linspace_sample=False,
use_pil=True):
self.num_seg = num_seg
self.seg_len = seg_len
self.frame_interval = frame_interval
self.valid_mode = valid_mode
self.select_left = select_left
self.dense_sample = dense_sample
self.linspace_sample = linspace_sample
self.use_pil = use_pil
def _get(self, frames_idx, results):
data_format = results['format']
if data_format == "frame":
frame_dir = results['frame_dir']
imgs = []
for idx in frames_idx:
img = Image.open(
os.path.join(frame_dir,
results['suffix'].format(idx))).convert('RGB')
imgs.append(img)
elif data_format == "MRI":
frame_dir = results['frame_dir']
imgs = []
MRI = sitk.GetArrayFromImage(sitk.ReadImage(frame_dir))
for idx in frames_idx:
item = MRI[idx]
item = cv2.resize(item, (224, 224))
imgs.append(item)
elif data_format == "video":
if results['backend'] == 'cv2':
frames = np.array(results['frames'])
imgs = []
for idx in frames_idx:
imgbuf = frames[idx]
img = Image.fromarray(imgbuf, mode='RGB')
imgs.append(img)
elif results['backend'] == 'decord':
container = results['frames']
if self.use_pil:
frames_select = container.get_batch(frames_idx)
# dearray_to_img
np_frames = frames_select.asnumpy()
imgs = []
for i in range(np_frames.shape[0]):
imgbuf = np_frames[i]
imgs.append(Image.fromarray(imgbuf, mode='RGB'))
else:
if frames_idx.ndim != 1:
frames_idx = np.squeeze(frames_idx)
frame_dict = {
idx: container[idx].asnumpy()
for idx in np.unique(frames_idx)
}
imgs = [frame_dict[idx] for idx in frames_idx]
elif results['backend'] == 'pyav':
imgs = []
frames = np.array(results['frames'])
for idx in frames_idx:
if self.dense_sample:
idx = idx - 1
imgbuf = frames[idx]
imgs.append(imgbuf)
imgs = np.stack(imgs) # thwc
else:
raise NotImplementedError
else:
raise NotImplementedError
results['imgs'] = imgs
return results
def _get_train_clips(self, num_frames):
ori_seg_len = self.seg_len * self.frame_interval
avg_interval = (num_frames - ori_seg_len + 1) // self.num_seg
if avg_interval > 0:
base_offsets = np.arange(self.num_seg) * avg_interval
clip_offsets = base_offsets + np.random.randint(avg_interval,
size=self.num_seg)
elif num_frames > max(self.num_seg, ori_seg_len):
clip_offsets = np.sort(
np.random.randint(num_frames - ori_seg_len + 1,
size=self.num_seg))
elif avg_interval == 0:
ratio = (num_frames - ori_seg_len + 1.0) / self.num_seg
clip_offsets = np.around(np.arange(self.num_seg) * ratio)
else:
            clip_offsets = np.zeros((self.num_seg, ), dtype=np.int64)
return clip_offsets
def _get_test_clips(self, num_frames):
ori_seg_len = self.seg_len * self.frame_interval
avg_interval = (num_frames - ori_seg_len + 1) / float(self.num_seg)
if num_frames > ori_seg_len - 1:
base_offsets = np.arange(self.num_seg) * avg_interval
            clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int64)
else:
            clip_offsets = np.zeros((self.num_seg, ), dtype=np.int64)
return clip_offsets
def __call__(self, results):
"""
Args:
frames_len: length of frames.
return:
sampling id.
"""
frames_len = int(results['frames_len'])
frames_idx = []
if self.frame_interval is not None:
assert isinstance(self.frame_interval, int)
if not self.valid_mode:
offsets = self._get_train_clips(frames_len)
else:
offsets = self._get_test_clips(frames_len)
offsets = offsets[:, None] + np.arange(
self.seg_len)[None, :] * self.frame_interval
offsets = np.concatenate(offsets)
offsets = offsets.reshape((-1, self.seg_len))
offsets = np.mod(offsets, frames_len)
offsets = np.concatenate(offsets)
if results['format'] == 'video':
frames_idx = offsets
elif results['format'] == 'frame':
frames_idx = list(offsets + 1)
else:
raise NotImplementedError
return self._get(frames_idx, results)
if self.linspace_sample:
if 'start_idx' in results and 'end_idx' in results:
offsets = np.linspace(results['start_idx'], results['end_idx'],
self.num_seg)
else:
offsets = np.linspace(0, frames_len - 1, self.num_seg)
offsets = np.clip(offsets, 0, frames_len - 1).astype(np.int64)
if results['format'] == 'video':
frames_idx = list(offsets)
frames_idx = [x % frames_len for x in frames_idx]
elif results['format'] == 'frame':
frames_idx = list(offsets + 1)
elif results['format'] == 'MRI':
frames_idx = list(offsets)
else:
raise NotImplementedError
return self._get(frames_idx, results)
average_dur = int(frames_len / self.num_seg)
if not self.select_left:
if self.dense_sample: # For ppTSM
if not self.valid_mode: # train
sample_pos = max(1, 1 + frames_len - 64)
t_stride = 64 // self.num_seg
start_idx = 0 if sample_pos == 1 else np.random.randint(
0, sample_pos - 1)
offsets = [(idx * t_stride + start_idx) % frames_len + 1
for idx in range(self.num_seg)]
frames_idx = offsets
else:
sample_pos = max(1, 1 + frames_len - 64)
t_stride = 64 // self.num_seg
start_list = np.linspace(0,
sample_pos - 1,
num=10,
dtype=int)
offsets = []
for start_idx in start_list.tolist():
offsets += [
(idx * t_stride + start_idx) % frames_len + 1
for idx in range(self.num_seg)
]
frames_idx = offsets
else:
for i in range(self.num_seg):
idx = 0
if not self.valid_mode:
if average_dur >= self.seg_len:
idx = random.randint(0, average_dur - self.seg_len)
idx += i * average_dur
elif average_dur >= 1:
idx += i * average_dur
else:
idx = i
else:
if average_dur >= self.seg_len:
idx = (average_dur - 1) // 2
idx += i * average_dur
elif average_dur >= 1:
idx += i * average_dur
else:
idx = i
for jj in range(idx, idx + self.seg_len):
if results['format'] == 'video':
frames_idx.append(int(jj % frames_len))
elif results['format'] == 'frame':
frames_idx.append(jj + 1)
elif results['format'] == 'MRI':
frames_idx.append(jj)
else:
raise NotImplementedError
return self._get(frames_idx, results)
else: # for TSM
if not self.valid_mode:
if average_dur > 0:
offsets = np.multiply(list(range(self.num_seg)),
average_dur) + np.random.randint(
average_dur, size=self.num_seg)
elif frames_len > self.num_seg:
offsets = np.sort(
np.random.randint(frames_len, size=self.num_seg))
else:
offsets = np.zeros(shape=(self.num_seg, ))
else:
if frames_len > self.num_seg:
average_dur_float = frames_len / self.num_seg
offsets = np.array([
int(average_dur_float / 2.0 + average_dur_float * x)
for x in range(self.num_seg)
])
else:
offsets = np.zeros(shape=(self.num_seg, ))
if results['format'] == 'video':
frames_idx = list(offsets)
frames_idx = [x % frames_len for x in frames_idx]
elif results['format'] == 'frame':
frames_idx = list(offsets + 1)
elif results['format'] == 'MRI':
frames_idx = list(offsets)
else:
raise NotImplementedError
return self._get(frames_idx, results)
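# Worked example for the select_left (TSM) branch above: frames_len = 80,
# num_seg = 8, seg_len = 1 in valid mode gives average_dur_float = 10 and
# offsets [5, 15, ..., 75]; in train mode each segment instead draws a random
# start inside its own span of 10 frames.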
@PIPELINES.register()
class SamplerPkl(object):
"""
    Sample frame ids.
    NOTE: Uses PIL to read images here, which differs slightly from CV2.
    Args:
        num_seg(int): number of segments.
        seg_len(int): number of sampled frames in each segment.
        backend(str): image decode backend, 'pillow' by default.
        valid_mode(bool): True or False.
    Returns:
        frames_idx: the indices of the sampled frames.
"""
def __init__(self, num_seg, seg_len, backend='pillow', valid_mode=False):
self.num_seg = num_seg
self.seg_len = seg_len
self.valid_mode = valid_mode
self.backend = backend
def _get(self, buf):
if isinstance(buf, str):
img = Image.open(StringIO(buf))
else:
img = Image.open(BytesIO(buf))
img = img.convert('RGB')
if self.backend != 'pillow':
img = np.array(img)
return img
def __call__(self, results):
"""
Args:
frames_len: length of frames.
return:
sampling id.
"""
filename = results['frame_dir']
        with open(filename, 'rb') as f:
            data_loaded = pickle.load(f, encoding='bytes')
video_name, label, frames = data_loaded
if isinstance(label, dict):
            label = label['动作类型']  # dict key meaning "action type"
results['labels'] = label
elif len(label) == 1:
results['labels'] = int(label[0])
else:
results['labels'] = int(label[0]) if random.random() < 0.5 else int(
label[1])
results['frames_len'] = len(frames)
frames_len = results['frames_len']
        average_dur = frames_len // self.num_seg
imgs = []
for i in range(self.num_seg):
idx = 0
if not self.valid_mode:
if average_dur >= self.seg_len:
idx = random.randint(0, average_dur - self.seg_len)
idx += i * average_dur
elif average_dur >= 1:
idx += i * average_dur
else:
idx = i
else:
if average_dur >= self.seg_len:
idx = (average_dur - 1) // 2
idx += i * average_dur
elif average_dur >= 1:
idx += i * average_dur
else:
idx = i
for jj in range(idx, idx + self.seg_len):
imgbuf = frames[int(jj % results['frames_len'])]
img = self._get(imgbuf)
imgs.append(img)
results['backend'] = self.backend
results['imgs'] = imgs
return results

@ -0,0 +1,375 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
from PIL import Image
from ..registry import PIPELINES
import os
import numpy as np
import io
import os.path as osp
from abc import ABCMeta, abstractmethod
import cv2
from cv2 import IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_UNCHANGED
import inspect
imread_backend = 'cv2'
imread_flags = {
'color': IMREAD_COLOR,
'grayscale': IMREAD_GRAYSCALE,
'unchanged': IMREAD_UNCHANGED
}
@PIPELINES.register()
class SampleFrames:
"""Sample frames from the video. """
def __init__(self,
clip_len,
frame_interval=1,
num_clips=1,
temporal_jitter=False,
twice_sample=False,
out_of_bound_opt='loop',
test_mode=False):
self.clip_len = clip_len
self.frame_interval = frame_interval
self.num_clips = num_clips
self.temporal_jitter = temporal_jitter
self.twice_sample = twice_sample
self.out_of_bound_opt = out_of_bound_opt
self.test_mode = test_mode
assert self.out_of_bound_opt in ['loop', 'repeat_last']
def _get_train_clips(self, num_frames):
"""Get clip offsets in train mode. """
ori_clip_len = self.clip_len * self.frame_interval
avg_interval = (num_frames - ori_clip_len + 1) // self.num_clips
if avg_interval > 0:
base_offsets = np.arange(self.num_clips) * avg_interval
clip_offsets = base_offsets + np.random.randint(
avg_interval, size=self.num_clips)
elif num_frames > max(self.num_clips, ori_clip_len):
clip_offsets = np.sort(
np.random.randint(
num_frames - ori_clip_len + 1, size=self.num_clips))
elif avg_interval == 0:
ratio = (num_frames - ori_clip_len + 1.0) / self.num_clips
clip_offsets = np.around(np.arange(self.num_clips) * ratio)
else:
            clip_offsets = np.zeros((self.num_clips, ), dtype=np.int64)
return clip_offsets
def _get_test_clips(self, num_frames):
"""Get clip offsets in test mode. """
ori_clip_len = self.clip_len * self.frame_interval
avg_interval = (num_frames - ori_clip_len + 1) / float(self.num_clips)
if num_frames > ori_clip_len - 1:
base_offsets = np.arange(self.num_clips) * avg_interval
            clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int64)
if self.twice_sample:
clip_offsets = np.concatenate([clip_offsets, base_offsets])
else:
            clip_offsets = np.zeros((self.num_clips, ), dtype=np.int64)
return clip_offsets
def _sample_clips(self, num_frames):
"""Choose clip offsets for the video in a given mode. """
if self.test_mode:
clip_offsets = self._get_test_clips(num_frames)
else:
clip_offsets = self._get_train_clips(num_frames)
return clip_offsets
def __call__(self, results):
"""Perform the SampleFrames loading. """
total_frames = results['total_frames']
clip_offsets = self._sample_clips(total_frames)
frame_inds = clip_offsets[:, None] + np.arange(
self.clip_len)[None, :] * self.frame_interval
frame_inds = np.concatenate(frame_inds)
if self.temporal_jitter:
perframe_offsets = np.random.randint(
self.frame_interval, size=len(frame_inds))
frame_inds += perframe_offsets
frame_inds = frame_inds.reshape((-1, self.clip_len))
if self.out_of_bound_opt == 'loop':
frame_inds = np.mod(frame_inds, total_frames)
elif self.out_of_bound_opt == 'repeat_last':
safe_inds = frame_inds < total_frames
unsafe_inds = 1 - safe_inds
last_ind = np.max(safe_inds * frame_inds, axis=1)
new_inds = (safe_inds * frame_inds + (unsafe_inds.T * last_ind).T)
frame_inds = new_inds
else:
raise ValueError('Illegal out_of_bound option.')
start_index = results['start_index']
frame_inds = np.concatenate(frame_inds) + start_index
        results['frame_inds'] = frame_inds.astype(np.int64)
results['clip_len'] = self.clip_len
results['frame_interval'] = self.frame_interval
results['num_clips'] = self.num_clips
return results
def __repr__(self):
repr_str = (f'{self.__class__.__name__}('
f'clip_len={self.clip_len}, '
f'frame_interval={self.frame_interval}, '
f'num_clips={self.num_clips}, '
f'temporal_jitter={self.temporal_jitter}, '
f'twice_sample={self.twice_sample}, '
f'out_of_bound_opt={self.out_of_bound_opt}, '
f'test_mode={self.test_mode})')
return repr_str
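# Worked example: total_frames = 100, clip_len = 8, frame_interval = 2 and
# num_clips = 1 in test mode give ori_clip_len = 16, avg_interval = 85.0 and a
# single clip offset int(85 / 2) = 42, so frame_inds = 42, 44, ..., 56 (plus
# start_index).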
class BaseStorageBackend(metaclass=ABCMeta):
"""Abstract class of storage backends. """
@abstractmethod
def get(self, filepath):
pass
@abstractmethod
def get_text(self, filepath):
pass
class HardDiskBackend(BaseStorageBackend):
"""Raw hard disks storage backend."""
def get(self, filepath):
filepath = str(filepath)
with open(filepath, 'rb') as f:
value_buf = f.read()
return value_buf
def get_text(self, filepath):
filepath = str(filepath)
with open(filepath, 'r') as f:
value_buf = f.read()
return value_buf
class FileClient:
"""A general file client to access files in different backend. """
_backends = {
'disk': HardDiskBackend,
}
def __init__(self, backend='disk', **kwargs):
if backend not in self._backends:
raise ValueError(
f'Backend {backend} is not supported. Currently supported ones'
f' are {list(self._backends.keys())}')
self.backend = backend
self.client = self._backends[backend](**kwargs)
@classmethod
def _register_backend(cls, name, backend, force=False):
if not isinstance(name, str):
raise TypeError('the backend name should be a string, '
f'but got {type(name)}')
if not inspect.isclass(backend):
raise TypeError(
f'backend should be a class but got {type(backend)}')
if not issubclass(backend, BaseStorageBackend):
raise TypeError(
f'backend {backend} is not a subclass of BaseStorageBackend')
if not force and name in cls._backends:
raise KeyError(
f'{name} is already registered as a storage backend, '
'add "force=True" if you want to override it')
cls._backends[name] = backend
@classmethod
def register_backend(cls, name, backend=None, force=False):
"""Register a backend to FileClient. """
if backend is not None:
cls._register_backend(name, backend, force=force)
return
def _register(backend_cls):
cls._register_backend(name, backend_cls, force=force)
return backend_cls
return _register
def get(self, filepath):
return self.client.get(filepath)
def get_text(self, filepath):
return self.client.get_text(filepath)
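# Minimal sketch (illustrative, names made up): the decorator form of
# register_backend above adds a hypothetical in-memory backend, e.g. for
# unit tests.
@FileClient.register_backend('memory')
class _MemoryBackend(BaseStorageBackend):
    store = {}  # filepath (str) -> bytes

    def get(self, filepath):
        return self.store[str(filepath)]

    def get_text(self, filepath):
        return self.store[str(filepath)].decode('utf-8')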
@PIPELINES.register()
class RawFrameDecode:
"""Load and decode frames with given indices. """
def __init__(self, io_backend='disk', decoding_backend='cv2', **kwargs):
self.io_backend = io_backend
self.decoding_backend = decoding_backend
self.kwargs = kwargs
self.file_client = None
def _pillow2array(self,img, flag='color', channel_order='bgr'):
"""Convert a pillow image to numpy array. """
channel_order = channel_order.lower()
if channel_order not in ['rgb', 'bgr']:
raise ValueError('channel order must be either "rgb" or "bgr"')
if flag == 'unchanged':
array = np.array(img)
if array.ndim >= 3 and array.shape[2] >= 3: # color image
array[:, :, :3] = array[:, :, (2, 1, 0)] # RGB to BGR
else:
# If the image mode is not 'RGB', convert it to 'RGB' first.
if img.mode != 'RGB':
if img.mode != 'LA':
# Most formats except 'LA' can be directly converted to RGB
img = img.convert('RGB')
else:
# When the mode is 'LA', the default conversion will fill in
# the canvas with black, which sometimes shadows black objects
# in the foreground.
#
# Therefore, a random color (124, 117, 104) is used for canvas
img_rgba = img.convert('RGBA')
img = Image.new('RGB', img_rgba.size, (124, 117, 104))
img.paste(img_rgba, mask=img_rgba.split()[3]) # 3 is alpha
if flag == 'color':
array = np.array(img)
if channel_order != 'rgb':
array = array[:, :, ::-1] # RGB to BGR
elif flag == 'grayscale':
img = img.convert('L')
array = np.array(img)
else:
raise ValueError(
'flag must be "color", "grayscale" or "unchanged", '
f'but got {flag}')
return array
    def _imfrombytes(self,content, flag='color', channel_order='bgr'):
"""Read an image from bytes. """
img_np = np.frombuffer(content, np.uint8)
flag = imread_flags[flag] if isinstance(flag, str) else flag
img = cv2.imdecode(img_np, flag)
if flag == IMREAD_COLOR and channel_order == 'rgb':
cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
return img
def __call__(self, results):
"""Perform the ``RawFrameDecode`` to pick frames given indices.
Args:
results (dict): The resulting dict to be modified and passed
to the next transform in pipeline.
"""
# mmcv.use_backend(self.decoding_backend)
directory = results['frame_dir']
suffix = results['suffix']
#modality = results['modality']
if self.file_client is None:
self.file_client = FileClient(self.io_backend, **self.kwargs)
imgs = list()
if results['frame_inds'].ndim != 1:
results['frame_inds'] = np.squeeze(results['frame_inds'])
offset = results.get('offset', 0)
for frame_idx in results['frame_inds']:
frame_idx += offset
filepath = osp.join(directory, suffix.format(frame_idx))
            img_bytes = self.file_client.get(filepath)  # read the image file as raw bytes
# Get frame with channel order RGB directly.
cur_frame = self._imfrombytes(img_bytes, channel_order='rgb')
imgs.append(cur_frame)
results['imgs'] = imgs
results['original_shape'] = imgs[0].shape[:2]
results['img_shape'] = imgs[0].shape[:2]
# we resize the gt_bboxes and proposals to their real scale
h, w = results['img_shape']
scale_factor = np.array([w, h, w, h])
if 'gt_bboxes' in results:
gt_bboxes = results['gt_bboxes']
gt_bboxes_new = (gt_bboxes * scale_factor).astype(np.float32)
results['gt_bboxes'] = gt_bboxes_new
if 'proposals' in results and results['proposals'] is not None:
proposals = results['proposals']
proposals = (proposals * scale_factor).astype(np.float32)
results['proposals'] = proposals
return results
def __repr__(self):
repr_str = (f'{self.__class__.__name__}('
f'io_backend={self.io_backend}, '
f'decoding_backend={self.decoding_backend})')
return repr_str
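
A minimal usage sketch for RawFrameDecode, assuming rawframes named img_00001.jpg and so on under a hypothetical directory:

import numpy as np

decode = RawFrameDecode(io_backend='disk')
results = {
    'frame_dir': '/data/rawframes/video_0001',  # hypothetical path
    'suffix': 'img_{:05}.jpg',
    'frame_inds': np.array([1, 3, 5]),
}
results = decode(results)
# results['imgs'] is a list of H x W x 3 RGB arrays;
# results['img_shape'] is (H, W) of the first decoded frame.
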
@PIPELINES.register()
class SampleAVAFrames(SampleFrames):
def __init__(self, clip_len, frame_interval=2, test_mode=False):
super().__init__(clip_len, frame_interval, test_mode=test_mode)
def _get_clips(self, center_index, skip_offsets, shot_info):
start = center_index - (self.clip_len // 2) * self.frame_interval
end = center_index + ((self.clip_len + 1) // 2) * self.frame_interval
frame_inds = list(range(start, end, self.frame_interval))
frame_inds = frame_inds + skip_offsets
frame_inds = np.clip(frame_inds, shot_info[0], shot_info[1] - 1)
return frame_inds
def __call__(self, results):
fps = results['fps']
timestamp = results['timestamp']
timestamp_start = results['timestamp_start']
shot_info = results['shot_info']
        # delta = (timestamp - timestamp_start) is this frame's offset, in
        # seconds, from the start of the 15-min video, so center_index =
        # fps * delta is the same offset in frames. The +1 presumably guards
        # against negative indices in the sampling below, which takes a clip
        # of frames centred on center_index.
center_index = fps * (timestamp - timestamp_start) + 1
skip_offsets = np.random.randint(
-self.frame_interval // 2, (self.frame_interval + 1) // 2,
size=self.clip_len)
frame_inds = self._get_clips(center_index, skip_offsets, shot_info)
        results['frame_inds'] = np.array(frame_inds, dtype=np.int64)
results['clip_len'] = self.clip_len
results['frame_interval'] = self.frame_interval
results['num_clips'] = 1
results['crop_quadruple'] = np.array([0, 0, 1, 1], dtype=np.float32)
return results
def __repr__(self):
repr_str = (f'{self.__class__.__name__}('
f'clip_len={self.clip_len}, '
f'frame_interval={self.frame_interval}, '
f'test_mode={self.test_mode})')
return repr_str
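
Worked through with hypothetical AVA-style numbers, the centre-index arithmetic above behaves as follows:

fps, timestamp, timestamp_start = 30, 902, 900
center_index = fps * (timestamp - timestamp_start) + 1       # 61
clip_len, frame_interval = 32, 2
start = center_index - (clip_len // 2) * frame_interval      # 29
end = center_index + ((clip_len + 1) // 2) * frame_interval  # 93
inds = list(range(start, end, frame_interval))               # 29, 31, ..., 91 (32 indices)
# _get_clips then adds the random skip_offsets and clips into shot_info bounds.
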

@ -0,0 +1,69 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
from PIL import Image
from ..registry import PIPELINES
@PIPELINES.register()
class SamplerUCF24(object):
"""
    Sample frame ids.
    NOTE: PIL is used to read images here, which differs slightly from CV2 decoding.
Args:
num_frames(int): The amount of frames used in a video
frame_interval(int): Sampling rate
        valid_mode(bool): whether the sampler runs in validation mode.
Returns:
        frames_idx: the indices of the sampled frames.
"""
def __init__(self,
num_frames=16,
frame_interval=1,
valid_mode=False):
self.num_frames = num_frames
        # in training mode the interval is drawn randomly (once, at construction)
        self.frame_interval = frame_interval if valid_mode else random.randint(1, 2)
self.valid_mode = valid_mode
def _get(self, frames_idxs, img_folder, results):
imgs = []
for idx in frames_idxs:
img = Image.open(
os.path.join(img_folder, '{:05d}.jpg'.format(idx))).convert('RGB')
imgs.append(img)
results['imgs'] = imgs
return results
def _make_clip(self, im_ind, max_num):
frame_idxs = []
for i in reversed(range(self.num_frames)):
# make it as a loop
i_temp = im_ind - i * self.frame_interval
if i_temp < 1:
i_temp = 1
elif i_temp > max_num:
i_temp = max_num
frame_idxs.append(i_temp)
return frame_idxs
def __call__(self, results):
img_folder, key_frame = os.path.split(results['filename'])
frame_len = len(os.listdir(img_folder))
key_idx = int(key_frame[0:5])
frame_idxs = self._make_clip(key_idx, frame_len)
return self._get(frame_idxs, img_folder, results)
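
A hedged usage sketch for SamplerUCF24; the folder layout is hypothetical but follows the 00042.jpg naming the class expects:

sampler = SamplerUCF24(num_frames=16, frame_interval=1, valid_mode=True)
results = {'filename': '/data/ucf24/rgb-images/Basketball/v_0001/00042.jpg'}
results = sampler(results)
# results['imgs'] holds 16 PIL RGB images, frames 00027.jpg ... 00042.jpg;
# _make_clip clamps out-of-range indices into [1, frame_len].
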

@ -0,0 +1,130 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from PIL import Image
import copy
import cv2
from ..registry import PIPELINES
@PIPELINES.register()
class MultiRestrictSize(object):
def __init__(self,
min_size=None,
max_size=800,
flip=False,
multi_scale=[1.3]):
self.min_size = min_size
self.max_size = max_size
self.multi_scale = multi_scale
self.flip = flip
        assert (min_size is None) or (max_size is None)  # min_size and max_size must not both be set
def __call__(self, sample):
samples = []
image = sample['current_img']
h, w = image.shape[:2]
for scale in self.multi_scale:
# Fixed range of scales
sc = None
            # Align the short edge when min_size is given, else the long edge
            if self.min_size is not None:
                short_edge = min(h, w)
                if short_edge > self.min_size:
                    sc = float(self.min_size) / short_edge
            else:
                long_edge = max(h, w)
                if long_edge > self.max_size:
                    sc = float(self.max_size) / long_edge
if sc is None:
new_h = h
new_w = w
else:
new_h = sc * h
new_w = sc * w
new_h = int(new_h * scale)
new_w = int(new_w * scale)
if (new_h - 1) % 16 != 0:
new_h = int(np.around((new_h - 1) / 16.) * 16 + 1)
if (new_w - 1) % 16 != 0:
new_w = int(np.around((new_w - 1) / 16.) * 16 + 1)
if new_h == h and new_w == w:
samples.append(sample)
else:
new_sample = {}
for elem in sample.keys():
if 'meta' in elem:
new_sample[elem] = sample[elem]
continue
tmp = sample[elem]
if 'label' in elem:
new_sample[elem] = sample[elem]
continue
else:
flagval = cv2.INTER_CUBIC
tmp = cv2.resize(tmp,
dsize=(new_w, new_h),
interpolation=flagval)
new_sample[elem] = tmp
samples.append(new_sample)
if self.flip:
now_sample = samples[-1]
new_sample = {}
for elem in now_sample.keys():
if 'meta' in elem:
new_sample[elem] = now_sample[elem].copy()
new_sample[elem]['flip'] = True
continue
tmp = now_sample[elem]
tmp = tmp[:, ::-1].copy()
new_sample[elem] = tmp
samples.append(new_sample)
return samples
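
The (16k + 1) size alignment above, worked through with hypothetical numbers (min_size=None, max_size=800, multi_scale=[1.0], a 720x1280 frame):

# sc = 800 / 1280 = 0.625      -> new_h, new_w = 450, 800
# (450 - 1) % 16 == 1, != 0    -> new_h = round(449 / 16) * 16 + 1 = 449
# (800 - 1) % 16 == 15, != 0   -> new_w = round(799 / 16) * 16 + 1 = 801
# Both sides end up snapped to the nearest size of the form 16k + 1.
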
@PIPELINES.register()
class MultiNorm(object):
def __call__(self, samples):
for idx in range(len(samples)):
sample = samples[idx]
for elem in sample.keys():
if 'meta' in elem:
continue
tmp = sample[elem]
if tmp is None:
continue
if tmp.ndim == 2:
tmp = tmp[:, :, np.newaxis]
else:
tmp = tmp / 255.
tmp -= (0.485, 0.456, 0.406)
tmp /= (0.229, 0.224, 0.225)
tmp = tmp.transpose((2, 0, 1))
samples[idx][elem] = tmp
return samples
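
A minimal sketch chaining the two ops, assuming a hypothetical sample dict holding one image plus its meta entry:

import numpy as np

resize = MultiRestrictSize(max_size=800)  # default multi_scale=[1.3]
norm = MultiNorm()
sample = {'current_img': np.zeros((720, 1280, 3), dtype=np.uint8),
          'meta': {'flip': False}}
samples = norm(resize(sample))
# each samples[i]['current_img'] is now a float CHW array, scaled to [0, 1]
# and normalised with the ImageNet mean/std used above.
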

@ -0,0 +1,40 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import os
import numpy as np
import random
import paddle
from ..registry import PIPELINES
"""
pipeline ops for Action Segmentation Dataset.
"""
@PIPELINES.register()
class SegmentationSampler(object):
def __init__(self, sample_rate):
self.sample_rate = sample_rate
def __call__(self, results):
for key, data in results.items():
if len(data.shape) == 1:
data = data[::self.sample_rate]
results[key] = copy.deepcopy(data)
else:
data = data[:, ::self.sample_rate]
results[key] = copy.deepcopy(data)
return results
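
A hedged usage sketch: downsampling hypothetical per-frame features and labels by a factor of 4.

import numpy as np

sampler = SegmentationSampler(sample_rate=4)
results = {
    'video_feat': np.random.rand(2048, 1000).astype(np.float32),  # [C, T]
    'video_gt': np.zeros(1000, dtype=np.int64),                   # [T]
}
results = sampler(results)
# results['video_feat'].shape -> (2048, 250); results['video_gt'].shape -> (250,)
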

@ -0,0 +1,18 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ..utils import Registry
PIPELINES = Registry("pipeline")
DATASETS = Registry("datasets")

@ -0,0 +1,3 @@
from .anet_prop import ANETproposal
__all__ = ['ANETproposal']

@ -0,0 +1,359 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import json
import numpy as np
import pandas as pd
import urllib.request as urllib2
from paddlevideo.utils import get_logger
logger = get_logger("paddlevideo")
class ANETproposal(object):
"""
This class is used for calculating AR@N and AUC;
    Code adapted from the ActivityNet GitHub repository (https://github.com/activitynet/ActivityNet.git).
"""
GROUND_TRUTH_FIELDS = ['database', 'taxonomy', 'version']
PROPOSAL_FIELDS = ['results', 'version', 'external_data']
API = 'http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/challenge19/api.py'
def __init__(self,
ground_truth_filename=None,
proposal_filename=None,
ground_truth_fields=GROUND_TRUTH_FIELDS,
proposal_fields=PROPOSAL_FIELDS,
tiou_thresholds=np.linspace(0.5, 0.95, 10),
max_avg_nr_proposals=None,
subset='validation',
verbose=False,
check_status=True):
if not ground_truth_filename:
raise IOError('Please input a valid ground truth file.')
if not proposal_filename:
raise IOError('Please input a valid proposal file.')
self.subset = subset
self.tiou_thresholds = tiou_thresholds
self.max_avg_nr_proposals = max_avg_nr_proposals
self.verbose = verbose
self.gt_fields = ground_truth_fields
self.pred_fields = proposal_fields
self.recall = None
self.avg_recall = None
self.proposals_per_video = None
self.check_status = check_status
# Retrieve blocked videos from server.
if self.check_status:
self.blocked_videos = self.get_blocked_videos()
else:
self.blocked_videos = list()
# Import ground truth and proposals.
self.ground_truth, self.activity_index = self._import_ground_truth(
ground_truth_filename)
self.proposal = self._import_proposal(proposal_filename)
if self.verbose:
print('[INIT] Loaded annotations from {} subset.'.format(subset))
nr_gt = len(self.ground_truth)
print('\tNumber of ground truth instances: {}'.format(nr_gt))
nr_pred = len(self.proposal)
print('\tNumber of proposals: {}'.format(nr_pred))
print('\tFixed threshold for tiou score: {}'.format(
self.tiou_thresholds))
def _import_ground_truth(self, ground_truth_filename):
"""
Reads ground truth file, checks if it is well formatted, and returns
the ground truth instances and the activity classes.
Parameters:
ground_truth_filename (str): full path to the ground truth json file.
Returns:
ground_truth (df): Data frame containing the ground truth instances.
activity_index (dict): Dictionary containing class index.
"""
with open(ground_truth_filename, 'r') as fobj:
data = json.load(fobj)
# Checking format
if not all([field in data.keys() for field in self.gt_fields]):
raise IOError('Please input a valid ground truth file.')
# Read ground truth data.
activity_index, cidx = {}, 0
video_lst, t_start_lst, t_end_lst, label_lst = [], [], [], []
for videoid, v in data['database'].items():
if self.subset != v['subset']:
continue
if videoid in self.blocked_videos:
continue
for ann in v['annotations']:
if ann['label'] not in activity_index:
activity_index[ann['label']] = cidx
cidx += 1
video_lst.append(videoid)
t_start_lst.append(float(ann['segment'][0]))
t_end_lst.append(float(ann['segment'][1]))
label_lst.append(activity_index[ann['label']])
ground_truth = pd.DataFrame({
'video-id': video_lst,
't-start': t_start_lst,
't-end': t_end_lst,
'label': label_lst
})
return ground_truth, activity_index
def _import_proposal(self, proposal_filename):
"""
Reads proposal file, checks if it is well formatted, and returns
the proposal instances.
Parameters:
proposal_filename (str): Full path to the proposal json file.
Returns:
proposal (df): Data frame containing the proposal instances.
"""
with open(proposal_filename, 'r') as fobj:
data = json.load(fobj)
# Checking format...
if not all([field in data.keys() for field in self.pred_fields]):
raise IOError('Please input a valid proposal file.')
# Read predictions.
video_lst, t_start_lst, t_end_lst = [], [], []
score_lst = []
for videoid, v in data['results'].items():
if videoid in self.blocked_videos:
continue
for result in v:
video_lst.append(videoid)
t_start_lst.append(float(result['segment'][0]))
t_end_lst.append(float(result['segment'][1]))
score_lst.append(result['score'])
proposal = pd.DataFrame({
'video-id': video_lst,
't-start': t_start_lst,
't-end': t_end_lst,
'score': score_lst
})
return proposal
def evaluate(self):
"""
        Evaluates a proposal file. To measure the performance of a method on
        the proposal task, we compute the area under the average-recall vs.
        average-number-of-proposals-per-video curve.
"""
recall, avg_recall, proposals_per_video = self.average_recall_vs_avg_nr_proposals(
self.ground_truth,
self.proposal,
max_avg_nr_proposals=self.max_avg_nr_proposals,
tiou_thresholds=self.tiou_thresholds)
area_under_curve = np.trapz(avg_recall, proposals_per_video)
if self.verbose:
print('[RESULTS] Performance on ActivityNet proposal task.')
with open("data/bmn/BMN_Test_results/auc_result.txt",
"a") as text_file:
text_file.write(
'\tArea Under the AR vs AN curve: {}% \n'.format(
100. * float(area_under_curve) /
proposals_per_video[-1]))
print('\tArea Under the AR vs AN curve: {}%'.format(
100. * float(area_under_curve) / proposals_per_video[-1]))
self.recall = recall
self.avg_recall = avg_recall
self.proposals_per_video = proposals_per_video
def average_recall_vs_avg_nr_proposals(self,
ground_truth,
proposals,
max_avg_nr_proposals=None,
tiou_thresholds=np.linspace(
0.5, 0.95, 10)):
"""
Computes the average recall given an average number of
proposals per video.
Parameters:
ground_truth(df): Data frame containing the ground truth instances.
Required fields: ['video-id', 't-start', 't-end']
proposal(df): Data frame containing the proposal instances.
                Required fields: ['video-id', 't-start', 't-end', 'score']
tiou_thresholds(1d-array | optional): array with tiou thresholds.
Returns:
            recall(2d-array): recall[i, j] is the recall at the i-th tiou
                threshold for the j-th average number of proposals per video.
average_recall(1d-array): recall averaged over a list of tiou threshold.
This is equivalent to recall.mean(axis=0).
proposals_per_video(1d-array): average number of proposals per video.
"""
# Get list of videos.
video_lst = ground_truth['video-id'].unique()
if not max_avg_nr_proposals:
max_avg_nr_proposals = float(
proposals.shape[0]) / video_lst.shape[0]
ratio = max_avg_nr_proposals * float(
video_lst.shape[0]) / proposals.shape[0]
# Adaptation to query faster
ground_truth_gbvn = ground_truth.groupby('video-id')
proposals_gbvn = proposals.groupby('video-id')
# For each video, computes tiou scores among the retrieved proposals.
score_lst = []
total_nr_proposals = 0
for videoid in video_lst:
# Get ground-truth instances associated to this video.
ground_truth_videoid = ground_truth_gbvn.get_group(videoid)
this_video_ground_truth = ground_truth_videoid.loc[:, [
't-start', 't-end'
]].values
# Get proposals for this video.
try:
proposals_videoid = proposals_gbvn.get_group(videoid)
            except KeyError:  # no proposals were retrieved for this video
n = this_video_ground_truth.shape[0]
score_lst.append(np.zeros((n, 1)))
continue
this_video_proposals = proposals_videoid.loc[:,
['t-start', 't-end'
]].values
if this_video_proposals.shape[0] == 0:
n = this_video_ground_truth.shape[0]
score_lst.append(np.zeros((n, 1)))
continue
# Sort proposals by score.
sort_idx = proposals_videoid['score'].argsort()[::-1]
this_video_proposals = this_video_proposals[sort_idx, :]
if this_video_proposals.ndim != 2:
this_video_proposals = np.expand_dims(this_video_proposals,
axis=0)
if this_video_ground_truth.ndim != 2:
this_video_ground_truth = np.expand_dims(
this_video_ground_truth, axis=0)
nr_proposals = np.minimum(
int(this_video_proposals.shape[0] * ratio),
this_video_proposals.shape[0])
total_nr_proposals += nr_proposals
this_video_proposals = this_video_proposals[:nr_proposals, :]
# Compute tiou scores.
tiou = self.wrapper_segment_iou(this_video_proposals,
this_video_ground_truth)
score_lst.append(tiou)
        # Since video lengths vary widely, we count proposals as a ratio of
        # the total proposals retrieved, i.e. average recall at a percentage
        # of the proposals retrieved per video.
# Computes average recall.
pcn_lst = np.arange(1, 101) / 100.0 * (max_avg_nr_proposals * float(
video_lst.shape[0]) / total_nr_proposals)
matches = np.empty((video_lst.shape[0], pcn_lst.shape[0]))
positives = np.empty(video_lst.shape[0])
recall = np.empty((tiou_thresholds.shape[0], pcn_lst.shape[0]))
# Iterates over each tiou threshold.
for ridx, tiou in enumerate(tiou_thresholds):
# Inspect positives retrieved per video at different
# number of proposals (percentage of the total retrieved).
for i, score in enumerate(score_lst):
# Total positives per video.
positives[i] = score.shape[0]
                # Find proposals that satisfy the minimum tiou threshold.
true_positives_tiou = score >= tiou
# Get number of proposals as a percentage of total retrieved.
pcn_proposals = np.minimum(
(score.shape[1] * pcn_lst).astype(int), score.shape[1])
for j, nr_proposals in enumerate(pcn_proposals):
# Compute the number of matches for each percentage of the proposals
matches[i, j] = np.count_nonzero(
(true_positives_tiou[:, :nr_proposals]).sum(axis=1))
# Computes recall given the set of matches per video.
recall[ridx, :] = matches.sum(axis=0) / positives.sum()
# Recall is averaged.
avg_recall = recall.mean(axis=0)
# Get the average number of proposals per video.
proposals_per_video = pcn_lst * (float(total_nr_proposals) /
video_lst.shape[0])
return recall, avg_recall, proposals_per_video
def get_blocked_videos(self, api=API):
api_url = '{}?action=get_blocked'.format(api)
req = urllib2.Request(api_url)
response = urllib2.urlopen(req)
return json.loads(response.read())
def wrapper_segment_iou(self, target_segments, candidate_segments):
"""
Compute intersection over union btw segments
Parameters:
target_segments(nd-array): 2-dim array in format [m x 2:=[init, end]]
candidate_segments(nd-array): 2-dim array in format [n x 2:=[init, end]]
Returns:
tiou(nd-array): 2-dim array [n x m] with IOU ratio.
        Note: it assumes that candidate segments are sparser than target segments.
"""
if candidate_segments.ndim != 2 or target_segments.ndim != 2:
raise ValueError('Dimension of arguments is incorrect')
n, m = candidate_segments.shape[0], target_segments.shape[0]
tiou = np.empty((n, m))
for i in range(m):
tiou[:, i] = self.segment_iou(target_segments[i, :],
candidate_segments)
return tiou
def segment_iou(self, target_segment, candidate_segments):
"""
Compute the temporal intersection over union between a
target segment and all the test segments.
Parameters:
target_segment(1d-array): Temporal target segment containing [starting, ending] times.
candidate_segments(2d-array): Temporal candidate segments containing N x [starting, ending] times.
Returns:
tiou(1d-array): Temporal intersection over union score of the N's candidate segments.
"""
tt1 = np.maximum(target_segment[0], candidate_segments[:, 0])
tt2 = np.minimum(target_segment[1], candidate_segments[:, 1])
# Intersection including Non-negative overlap score.
segments_intersection = (tt2 - tt1).clip(0)
# Segment union.
segments_union = (candidate_segments[:, 1] - candidate_segments[:, 0]) \
+ (target_segment[1] - target_segment[0]) - segments_intersection
# Compute overlap as the ratio of the intersection
# over union of two segments.
tIoU = segments_intersection.astype(float) / segments_union
return tIoU
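
A hedged usage sketch of ANETproposal; both file paths are hypothetical, verbose=False avoids the hard-coded results-file write in evaluate(), and check_status=False skips the remote blocked-videos lookup:

import numpy as np

anet_prop = ANETproposal(
    ground_truth_filename='data/activitynet_gt.json',
    proposal_filename='data/bmn_results_validation.json',
    max_avg_nr_proposals=100,
    subset='validation',
    verbose=False,
    check_status=False)
anet_prop.evaluate()
ar_at_100 = anet_prop.avg_recall[-1]  # AR@100, given max_avg_nr_proposals=100
area = np.trapz(anet_prop.avg_recall, anet_prop.proposals_per_video)
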

@ -0,0 +1,36 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .bmn_metric import BMNMetric
from .build import build_metric
from .center_crop_metric import CenterCropMetric
from .depth_metric import DepthMetric
from .msrvtt_metric import MSRVTTMetric
from .multi_crop_metric import MultiCropMetric
from .registry import METRIC
from .skeleton_metric import SkeletonMetric
from .transnetv2_metric import TransNetV2Metric
from .youtube8m.eval_util import HitOneMetric
from .segmentation_metric import SegmentationMetric
from .ava_metric import AVAMetric
from .vos_metric import VOSMetric
from .center_crop_metric_MRI import CenterCropMetric_MRI
from .yowo_metric import YOWOMetric
__all__ = [
'METRIC', 'build_metric', 'MultiCropMetric', 'BMNMetric',
'CenterCropMetric', 'SkeletonMetric', 'HitOneMetric', 'TransNetV2Metric',
'DepthMetric', 'MSRVTTMetric', 'VOSMetric', 'CenterCropMetric_MRI','AVAMetric',
'SegmentationMetric', 'YOWOMetric'
]
