# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.nn as nn

from ..registry import BACKBONES
from .darknet import Darknet
from .resnext101 import ResNext101


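# Channel Attention Module (CAM), in the style of DANet's channel attention:
# a [C, C] affinity matrix computed from the flattened feature map reweights
# the channels, and a zero-initialized learnable scalar `gamma` blends the
# result back into the input, so the module starts out as an identity mapping.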
class CAM_Module(nn.Layer):
    def __init__(self, in_dim):
        super(CAM_Module, self).__init__()
        self.channel_in = in_dim
        # gamma is a learnable scalar initialized to zero.
        temp = paddle.zeros([1], dtype='float32')
        self.gamma = paddle.create_parameter(
            shape=temp.shape,
            dtype=str(temp.numpy().dtype),
            default_initializer=paddle.nn.initializer.Assign(temp))
        self.softmax = nn.Softmax(axis=-1)

    def forward(self, x):
        m_batchsize, C, height, width = x.shape
        # Flatten the spatial dimensions: [N, C, H*W].
        proj_query = paddle.reshape(x, [m_batchsize, C, -1])
        proj_key = paddle.transpose(
            paddle.reshape(x, [m_batchsize, C, -1]), perm=[0, 2, 1])
        # Channel affinity matrix: [N, C, C].
        energy = paddle.bmm(proj_query, proj_key)
        # Subtract each affinity from its row-wise max, so the softmax puts
        # more weight on less correlated channel pairs.
        energy_new = paddle.expand_as(
            paddle.max(energy, axis=-1, keepdim=True), energy) - energy
        attention = self.softmax(energy_new)
        proj_value = paddle.reshape(x, [m_batchsize, C, -1])

        # Reweight the channels and restore the spatial layout.
        out = paddle.bmm(attention, proj_value)
        out = out.reshape([m_batchsize, C, height, width])
        # gamma starts at zero, so the block is initially an identity mapping.
        out = self.gamma * out + x
        return out
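

# A quick shape check for CAM_Module (illustrative sketch, not part of the
# original file); channel attention preserves the input shape:
#
#   cam = CAM_Module(in_dim=1024)
#   y = cam(paddle.randn([2, 1024, 7, 7]))
#   assert y.shape == [2, 1024, 7, 7]
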
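# CFAM ("Channel Fusion and Attention Mechanism" in the YOWO paper) refines
# the concatenated 2D/3D features with conv-BN-ReLU blocks, applies channel
# attention in between, and projects the result to `out_channels`.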
class CFAMBlock(nn.Layer):
    def __init__(self, in_channels, out_channels):
        super(CFAMBlock, self).__init__()
        inter_channels = 1024
        self.conv_bn_relu1 = nn.Sequential(
            nn.Conv2D(in_channels, inter_channels, kernel_size=1,
                      bias_attr=False),
            nn.BatchNorm2D(inter_channels),
            nn.ReLU())
        self.conv_bn_relu2 = nn.Sequential(
            nn.Conv2D(inter_channels, inter_channels, 3, padding=1,
                      bias_attr=False),
            nn.BatchNorm2D(inter_channels),
            nn.ReLU())

        self.sc = CAM_Module(inter_channels)

        self.conv_bn_relu3 = nn.Sequential(
            nn.Conv2D(inter_channels, inter_channels, 3, padding=1,
                      bias_attr=False),
            nn.BatchNorm2D(inter_channels),
            nn.ReLU())
        self.conv_out = nn.Sequential(
            nn.Dropout2D(0.1),
            nn.Conv2D(inter_channels, out_channels, 1, bias_attr=True))

    def forward(self, x):
        x = self.conv_bn_relu1(x)
        x = self.conv_bn_relu2(x)
        x = self.sc(x)
        x = self.conv_bn_relu3(x)
        output = self.conv_out(x)

        return output


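# YOWO fuses 2D appearance features of the clip's last frame (Darknet) with
# 3D spatio-temporal features of the whole clip (ResNext101) through a CFAM
# block. The final 1x1 conv emits 5 anchors x (num_class + 4 box offsets +
# 1 objectness score) channels per cell, YOLOv2-style.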
@BACKBONES.register()
class YOWO(nn.Layer):
    def __init__(self, num_class, pretrained_2d=None, pretrained_3d=None):
        super(YOWO, self).__init__()

        self.pretrained_2d = pretrained_2d
        self.pretrained_3d = pretrained_3d
        self.backbone_2d = Darknet()
        self.backbone_3d = ResNext101()
        self.num_ch_2d = 425  # output channels of the 2D backbone
        self.num_ch_3d = 2048  # output channels of the 3D backbone
        self.num_class = num_class
        self.cfam = CFAMBlock(self.num_ch_2d + self.num_ch_3d, 1024)
        self.conv_final = nn.Conv2D(
            1024, 5 * (self.num_class + 4 + 1), kernel_size=1,
            bias_attr=False)
        self.seen = 0

    def init_weights(self):
        if self.pretrained_2d is not None:
            self.backbone_2d = self.load_pretrain_weight(
                self.backbone_2d, self.pretrained_2d)
        if self.pretrained_3d is not None:
            self.backbone_3d = self.load_pretrain_weight(
                self.backbone_3d, self.pretrained_3d)

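    # Loads a checkpoint tolerantly: weights whose names are missing from the
    # model or whose shapes do not match are reported and skipped, so a
    # partially compatible checkpoint still loads.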
    def load_pretrain_weight(self, model, weights_path):
        model_dict = model.state_dict()

        param_state_dict = paddle.load(weights_path)
        ignore_weights = set()

        # Hack to fit Faster R-CNN checkpoints: pretrained weights carry a
        # 'backbone' prefix, while the res5 module lives in bbox_head.head.
        # Rewrite the prefix so those weights load correctly.
        for k in list(param_state_dict.keys()):
            if 'backbone.res5' in k:
                new_k = k.replace('backbone', 'bbox_head.head')
                if new_k in model_dict.keys():
                    value = param_state_dict.pop(k)
                    param_state_dict[new_k] = value

        for name, weight in param_state_dict.items():
            if name in model_dict.keys():
                if list(weight.shape) != list(model_dict[name].shape):
                    print('{} not used, its shape {} does not match {} in '
                          'the model.'.format(name, weight.shape,
                                              list(model_dict[name].shape)))
                    ignore_weights.add(name)
            else:
                print('Ignoring redundant weight {}.'.format(name))
                ignore_weights.add(name)

        for weight in ignore_weights:
            param_state_dict.pop(weight, None)

        model.set_dict(param_state_dict)
        print('Finished loading model weights: {}'.format(weights_path))
        return model

    def forward(self, input):
        x_3d = input  # the whole input clip, [N, C, T, H, W]
        x_2d = input[:, :, -1, :, :]  # the last frame of the clip

        x_2d = self.backbone_2d(x_2d)

        x_3d = self.backbone_3d(x_3d)
        # The 3D backbone reduces the temporal dimension to 1; drop it.
        x_3d = paddle.squeeze(x_3d, axis=2)

        # Fuse the 2D and 3D features along the channel axis.
        x = paddle.concat([x_3d, x_2d], axis=1)
        x = self.cfam(x)
        out = self.conv_final(x)

        return out
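

# A minimal usage sketch (illustrative; the paths, class count, and the
# 16-frame 224x224 input geometry are assumptions, not part of this file):
#
#   model = YOWO(num_class=24,
#                pretrained_2d='darknet.pdparams',     # hypothetical path
#                pretrained_3d='resnext101.pdparams')  # hypothetical path
#   model.init_weights()
#   clip = paddle.randn([1, 3, 16, 224, 224])  # [N, C, T, H, W]
#   out = model(clip)  # e.g. [1, 5 * (24 + 4 + 1), 7, 7] for stride-32 backbones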