import collections.abc
from collections import OrderedDict
from itertools import repeat
from typing import Any, Callable, Optional, Tuple, Union

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.layer import Identity

from ..registry import BACKBONES

container_abcs = collections.abc

"""Model Config"""
A0 = {'block_num': [0, 1, 3, 3, 4, 4]}
A0['conv1'] = [3, 8, (1, 3, 3), (1, 2, 2), (0, 1, 1)]
A0['b2_l0'] = [8, 8, 24, (1, 5, 5), (1, 2, 2), (0, 2, 2), (0, 1, 1)]
A0['b3_l0'] = [8, 32, 80, (3, 3, 3), (1, 2, 2), (1, 0, 0), (0, 0, 0)]
A0['b3_l1'] = [32, 32, 80, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]
A0['b3_l2'] = [32, 32, 80, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]
A0['b4_l0'] = [32, 56, 184, (5, 3, 3), (1, 2, 2), (2, 0, 0), (0, 0, 0)]
A0['b4_l1'] = [56, 56, 112, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]
A0['b4_l2'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]
A0['b5_l0'] = [56, 56, 184, (5, 3, 3), (1, 1, 1), (2, 1, 1), (0, 1, 1)]
A0['b5_l1'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]
A0['b5_l2'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]
A0['b5_l3'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)]
A0['b6_l0'] = [56, 104, 384, (5, 3, 3), (1, 2, 2), (2, 1, 1), (0, 1, 1)]
A0['b6_l1'] = [104, 104, 280, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)]
A0['b6_l2'] = [104, 104, 280, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)]
A0['b6_l3'] = [104, 104, 344, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)]
A0['conv7'] = [104, 480, (1, 1, 1), (1, 1, 1), (0, 0, 0)]

MODEL_CONFIG = {'A0': A0}


def _ntuple(n):
    def parse(x):
        if isinstance(x, container_abcs.Iterable):
            return x
        return tuple(repeat(x, n))

    return parse


def _make_divisible(v: float,
                    divisor: int,
                    min_value: Optional[int] = None) -> int:
    """
    This function is taken from the original tf repo. It ensures that all
    layers have a channel number that is divisible by 8. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    """
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


_single = _ntuple(1)
_pair = _ntuple(2)
_triple = _ntuple(3)
_quadruple = _ntuple(4)
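
# ---------------------------------------------------------------------------
# Worked examples for the helpers above (comments only, not executed):
#
#   _triple(3)          -> (3, 3, 3)   # scalars are repeated n times
#   _triple((1, 2, 2))  -> (1, 2, 2)   # iterables pass through unchanged
#
#   _make_divisible rounds a channel count to a multiple of `divisor`,
#   never landing below 90% of the requested value:
#   _make_divisible(12, 8) -> 16   # int(12 + 4) // 8 * 8 = 16 >= 0.9 * 12
#   _make_divisible(18, 8) -> 24   # int(18 + 4) // 8 * 8 = 16 < 0.9 * 18, so +8
# ---------------------------------------------------------------------------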

class CausalModule(nn.Layer):
    def __init__(self) -> None:
        super().__init__()
        self.activation = None

    def reset_activation(self) -> None:
        self.activation = None


class Conv2dBNActivation(nn.Sequential):
    def __init__(
        self,
        in_planes: int,
        out_planes: int,
        kernel_size: Union[int, Tuple[int, int]],
        padding: Union[int, Tuple[int, int]],
        stride: Union[int, Tuple[int, int]] = 1,
        groups: int = 1,
        norm_layer: Optional[Callable[..., nn.Layer]] = None,
        activation_layer: Optional[Callable[..., nn.Layer]] = None,
        **kwargs: Any,
    ) -> None:
        kernel_size = _pair(kernel_size)
        stride = _pair(stride)
        padding = _pair(padding)
        if norm_layer is None:
            norm_layer = Identity
        if activation_layer is None:
            activation_layer = Identity
        self.kernel_size = kernel_size
        self.stride = stride
        dict_layers = (nn.Conv2D(in_planes,
                                 out_planes,
                                 kernel_size=kernel_size,
                                 stride=stride,
                                 padding=padding,
                                 groups=groups,
                                 **kwargs),
                       norm_layer(out_planes, momentum=0.1),
                       activation_layer())

        self.out_channels = out_planes
        super(Conv2dBNActivation, self).__init__(dict_layers[0],
                                                 dict_layers[1],
                                                 dict_layers[2])


class Conv3DBNActivation(nn.Sequential):
    def __init__(
        self,
        in_planes: int,
        out_planes: int,
        kernel_size: Union[int, Tuple[int, int, int]],
        padding: Union[int, Tuple[int, int, int]],
        stride: Union[int, Tuple[int, int, int]] = 1,
        groups: int = 1,
        norm_layer: Optional[Callable[..., nn.Layer]] = None,
        activation_layer: Optional[Callable[..., nn.Layer]] = None,
        **kwargs: Any,
    ) -> None:
        kernel_size = _triple(kernel_size)
        stride = _triple(stride)
        padding = _triple(padding)
        if norm_layer is None:
            norm_layer = Identity
        if activation_layer is None:
            activation_layer = Identity
        self.kernel_size = kernel_size
        self.stride = stride
        dict_layers = (nn.Conv3D(in_planes,
                                 out_planes,
                                 kernel_size=kernel_size,
                                 stride=stride,
                                 padding=padding,
                                 groups=groups,
                                 **kwargs),
                       norm_layer(out_planes, momentum=0.1),
                       activation_layer())

        self.out_channels = out_planes
        super(Conv3DBNActivation, self).__init__(dict_layers[0],
                                                 dict_layers[1],
                                                 dict_layers[2])


class ConvBlock3D(CausalModule):
    def __init__(
        self,
        in_planes: int,
        out_planes: int,
        kernel_size: Union[int, Tuple[int, int, int]],
        causal: bool,
        conv_type: str,
        padding: Union[int, Tuple[int, int, int]] = 0,
        stride: Union[int, Tuple[int, int, int]] = 1,
        norm_layer: Optional[Callable[..., nn.Layer]] = None,
        activation_layer: Optional[Callable[..., nn.Layer]] = None,
        bias_attr: bool = False,
        **kwargs: Any,
    ) -> None:
        super().__init__()
        kernel_size = _triple(kernel_size)
        stride = _triple(stride)
        padding = _triple(padding)
        self.conv_2 = None
        if causal is True:
            # In causal mode the temporal padding is supplied by the stream
            # buffer (see _cat_stream_buffer), not by the convolution itself.
            padding = (0, padding[1], padding[2])
        if conv_type != "2plus1d" and conv_type != "3d":
            raise ValueError("only 2plus1d or 3d are "
                             "allowed as 3d convolutions")

        if conv_type == "2plus1d":
            # Factorize the 3D convolution into a spatial 2D convolution
            # followed by a temporal convolution (a 2D conv with kernel
            # (k_t, 1) applied over the flattened spatial axis).
            self.conv_1 = Conv2dBNActivation(in_planes,
                                             out_planes,
                                             kernel_size=(kernel_size[1],
                                                          kernel_size[2]),
                                             padding=(padding[1], padding[2]),
                                             stride=(stride[1], stride[2]),
                                             activation_layer=activation_layer,
                                             norm_layer=norm_layer,
                                             bias_attr=bias_attr,
                                             **kwargs)
            if kernel_size[0] > 1:
                self.conv_2 = Conv2dBNActivation(
                    in_planes,
                    out_planes,
                    kernel_size=(kernel_size[0], 1),
                    padding=(padding[0], 0),
                    stride=(stride[0], 1),
                    activation_layer=activation_layer,
                    norm_layer=norm_layer,
                    bias_attr=bias_attr,
                    **kwargs)
        elif conv_type == "3d":
            self.conv_1 = Conv3DBNActivation(in_planes,
                                             out_planes,
                                             kernel_size=kernel_size,
                                             padding=padding,
                                             activation_layer=activation_layer,
                                             norm_layer=norm_layer,
                                             stride=stride,
                                             bias_attr=bias_attr,
                                             **kwargs)
        self.padding = padding
        self.kernel_size = kernel_size
        self.dim_pad = self.kernel_size[0] - 1
        self.stride = stride
        self.causal = causal
        self.conv_type = conv_type

    def _forward(self, x: paddle.Tensor) -> paddle.Tensor:
        if self.dim_pad > 0 and self.conv_2 is None and self.causal is True:
            x = self._cat_stream_buffer(x)
        b, c, t, h, w = x.shape
        if self.conv_type == "2plus1d":
            x = paddle.transpose(x, (0, 2, 1, 3, 4))  # bcthw --> btchw
            x = paddle.reshape_(x, (-1, c, h, w))  # btchw --> bt,c,h,w
        x = self.conv_1(x)
        if self.conv_type == "2plus1d":
            b, c, h, w = x.shape
            x = paddle.reshape_(x, (-1, t, c, h, w))  # bt,c,h,w --> b,t,c,h,w
            x = paddle.transpose(x, (0, 2, 1, 3, 4))  # b,t,c,h,w --> b,c,t,h,w
            if self.conv_2 is not None:
                if self.dim_pad > 0 and self.causal is True:
                    x = self._cat_stream_buffer(x)
                b, c, t, h, w = x.shape
                x = paddle.reshape_(x, (b, c, t, h * w))
                x = self.conv_2(x)
                b, c, t, _ = x.shape
                x = paddle.reshape_(x, (b, c, t, h, w))
        return x

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        x = self._forward(x)
        return x

    def _cat_stream_buffer(self, x: paddle.Tensor) -> paddle.Tensor:
        if self.activation is None:
            self._setup_activation(x.shape)
        x = paddle.concat((self.activation, x), 2)
        self._save_in_activation(x)
        return x

    def _save_in_activation(self, x: paddle.Tensor) -> None:
        assert self.dim_pad > 0
        # Keep the last `dim_pad` frames as a detached buffer; the numpy
        # round trip also cuts the buffer off the autograd graph.
        self.activation = paddle.to_tensor(
            x.numpy()[:, :, -self.dim_pad:, ...]).clone().detach()

    def _setup_activation(self, input_shape: Tuple[int, ...]) -> None:
        assert self.dim_pad > 0
        self.activation = paddle.zeros(shape=[
            *input_shape[:2],  # type: ignore
            self.dim_pad,
            *input_shape[3:]
        ])
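
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file; the helper name is
# hypothetical): in causal mode ConvBlock3D keeps the last
# `kernel_size[0] - 1` input frames in `self.activation`, so a clip processed
# chunk by chunk yields the same output as a single full-clip pass.
# ---------------------------------------------------------------------------
def _demo_causal_conv_block():
    block = ConvBlock3D(in_planes=3,
                        out_planes=8,
                        kernel_size=(3, 3, 3),
                        padding=(1, 1, 1),
                        causal=True,
                        conv_type="3d")
    clip = paddle.randn([1, 3, 8, 16, 16])  # [N, C, T, H, W]
    full = block(clip)  # whole clip in one pass
    block.reset_activation()  # clear the temporal stream buffer
    streamed = paddle.concat(
        [block(clip[:, :, :4]), block(clip[:, :, 4:])], axis=2)
    # Up to floating point noise, both paths agree frame for frame.
    print(float(paddle.abs(full - streamed).max()))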

class TemporalCGAvgPool3D(CausalModule):
    def __init__(self) -> None:
        super().__init__()
        self.n_cumulated_values = 0
        self.register_forward_post_hook(self._detach_activation)

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        input_shape = x.shape
        cumulative_sum = paddle.cumsum(x, axis=2)
        if self.activation is None:
            self.activation = cumulative_sum[:, :, -1:].clone()
        else:
            cumulative_sum += self.activation
            self.activation = cumulative_sum[:, :, -1:].clone()

        # Frame t is divided by the number of frames seen so far, so the
        # output is a running temporal mean. Cast to the input dtype so the
        # division below is well typed.
        noe = paddle.arange(1, input_shape[2] + 1, dtype=x.dtype)
        axis = paddle.to_tensor([0, 1, 3, 4])
        noe = paddle.unsqueeze(noe, axis=axis)
        divisor = noe.expand(x.shape)
        x = cumulative_sum / (self.n_cumulated_values + divisor)
        self.n_cumulated_values += input_shape[2]
        return x

    @staticmethod
    def _detach_activation(module: CausalModule, inputs: paddle.Tensor,
                           output: paddle.Tensor) -> None:
        # detach() is not in-place: keep the detached tensor so the buffer
        # does not hold on to the autograd graph across calls.
        module.activation = module.activation.detach()

    def reset_activation(self) -> None:
        super().reset_activation()
        self.n_cumulated_values = 0


class SqueezeExcitation(nn.Layer):
    def __init__(self,
                 input_channels: int,
                 activation_2: nn.Layer,
                 activation_1: nn.Layer,
                 conv_type: str,
                 causal: bool,
                 squeeze_factor: int = 4,
                 bias_attr: bool = True) -> None:
        super().__init__()
        self.causal = causal
        se_multiplier = 2 if causal else 1
        squeeze_channels = _make_divisible(
            input_channels // squeeze_factor * se_multiplier, 8)
        self.temporal_cumulative_GAvg3D = TemporalCGAvgPool3D()
        self.fc1 = ConvBlock3D(input_channels * se_multiplier,
                               squeeze_channels,
                               kernel_size=(1, 1, 1),
                               padding=0,
                               causal=causal,
                               conv_type=conv_type,
                               bias_attr=bias_attr)
        self.activation_1 = activation_1()
        self.activation_2 = activation_2()
        self.fc2 = ConvBlock3D(squeeze_channels,
                               input_channels,
                               kernel_size=(1, 1, 1),
                               padding=0,
                               causal=causal,
                               conv_type=conv_type,
                               bias_attr=bias_attr)

    def _scale(self, inputs: paddle.Tensor) -> paddle.Tensor:
        if self.causal:
            x_space = paddle.mean(inputs, axis=[3, 4], keepdim=True)
            scale = self.temporal_cumulative_GAvg3D(x_space)
            scale = paddle.concat((scale, x_space), axis=1)
        else:
            scale = F.adaptive_avg_pool3d(inputs, 1)
        scale = self.fc1(scale)
        scale = self.activation_1(scale)
        scale = self.fc2(scale)
        return self.activation_2(scale)

    def forward(self, inputs: paddle.Tensor) -> paddle.Tensor:
        scale = self._scale(inputs)
        return scale * inputs
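
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file; the helper name is
# hypothetical): TemporalCGAvgPool3D computes a *running* temporal mean, so
# frame t sees the average of frames 0..t, and chunked calls continue where
# the previous call left off.
# ---------------------------------------------------------------------------
def _demo_cumulative_avg_pool():
    pool = TemporalCGAvgPool3D()
    x = paddle.randn([1, 4, 6, 1, 1])  # [N, C, T, H, W]
    full = pool(x)  # full[:, :, t] is the mean of x over frames 0..t
    pool.reset_activation()
    streamed = paddle.concat([pool(x[:, :, :3]), pool(x[:, :, 3:])], axis=2)
    # The streamed running mean matches the single-pass one frame for frame,
    # and the last frame equals the plain global temporal average.
    print(float(paddle.abs(full - streamed).max()))
    print(float(paddle.abs(full[:, :, -1] - x.mean(axis=2)).max()))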

class BasicBneck(nn.Layer):
    def __init__(
        self,
        input_channels,
        out_channels,
        expanded_channels,
        kernel_size,
        stride,
        padding,
        padding_avg,
        causal: bool,
        conv_type: str,
        norm_layer: Optional[Callable[..., nn.Layer]] = None,
        activation_layer: Optional[Callable[..., nn.Layer]] = None,
    ) -> None:
        super().__init__()
        assert type(stride) is tuple
        if (not stride[0] == 1 or not (1 <= stride[1] <= 2)
                or not (1 <= stride[2] <= 2)):
            raise ValueError('illegal stride value')
        self.res = None
        self.expand = None  # stays None when no expansion conv is needed

        layers = []
        if expanded_channels != out_channels:
            # expand
            self.expand = ConvBlock3D(in_planes=input_channels,
                                      out_planes=expanded_channels,
                                      kernel_size=(1, 1, 1),
                                      padding=(0, 0, 0),
                                      causal=causal,
                                      conv_type=conv_type,
                                      norm_layer=norm_layer,
                                      activation_layer=activation_layer)
        # depthwise
        self.deep = ConvBlock3D(in_planes=expanded_channels,
                                out_planes=expanded_channels,
                                kernel_size=kernel_size,
                                padding=padding,
                                stride=stride,
                                groups=expanded_channels,
                                causal=causal,
                                conv_type=conv_type,
                                norm_layer=norm_layer,
                                activation_layer=activation_layer)
        # SE
        self.se = SqueezeExcitation(
            expanded_channels,
            causal=causal,
            activation_1=activation_layer,
            activation_2=(nn.Sigmoid if conv_type == "3d" else nn.Hardsigmoid),
            conv_type=conv_type)
        # project
        self.project = ConvBlock3D(expanded_channels,
                                   out_channels,
                                   kernel_size=(1, 1, 1),
                                   padding=(0, 0, 0),
                                   causal=causal,
                                   conv_type=conv_type,
                                   norm_layer=norm_layer,
                                   activation_layer=Identity)

        if not (stride == (1, 1, 1) and input_channels == out_channels):
            if stride != (1, 1, 1):
                layers.append(
                    nn.AvgPool3D((1, 3, 3), stride=stride,
                                 padding=padding_avg))
            layers.append(
                ConvBlock3D(
                    in_planes=input_channels,
                    out_planes=out_channels,
                    kernel_size=(1, 1, 1),
                    padding=(0, 0, 0),
                    norm_layer=norm_layer,
                    activation_layer=Identity,
                    causal=causal,
                    conv_type=conv_type,
                ))
            self.res = nn.Sequential(*layers)
        # learnable scale on the bottleneck branch (ReZero-style)
        self.alpha = self.create_parameter(shape=[1], dtype="float32")

    def forward(self, inputs: paddle.Tensor) -> paddle.Tensor:
        if self.res is not None:
            residual = self.res(inputs)
        else:
            residual = inputs
        if self.expand is not None:
            x = self.expand(inputs)
        else:
            x = inputs
        x = self.deep(x)
        x = self.se(x)
        x = self.project(x)
        result = residual + self.alpha * x
        return result
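
# ---------------------------------------------------------------------------
# How a MODEL_CONFIG row maps onto BasicBneck (comments only): e.g.
#   A0['b3_l0'] = [8, 32, 80, (3, 3, 3), (1, 2, 2), (1, 0, 0), (0, 0, 0)]
# is consumed positionally as
#   input_channels=8, out_channels=32, expanded_channels=80,
#   kernel_size=(3, 3, 3), stride=(1, 2, 2), padding=(1, 0, 0),
#   padding_avg=(0, 0, 0)
# by the block-building loop in MoViNet.__init__ below.
# ---------------------------------------------------------------------------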

@BACKBONES.register()
class MoViNet(nn.Layer):
    def __init__(
        self,
        model_type: str = 'A0',
        hidden_dim: int = 2048,
        causal: bool = True,
        num_classes: int = 400,
        conv_type: str = "3d",
    ) -> None:
        """
        causal: causal mode
        num_classes: number of classes for classification
        conv_type: type of convolution, either 3d or 2plus1d
        """
        super().__init__()
        blocks_dic = OrderedDict()
        cfg = MODEL_CONFIG[model_type]

        norm_layer = nn.BatchNorm3D if conv_type == "3d" else nn.BatchNorm2D
        activation_layer = nn.Swish if conv_type == "3d" else nn.Hardswish

        # conv1
        self.conv1 = ConvBlock3D(in_planes=cfg['conv1'][0],
                                 out_planes=cfg['conv1'][1],
                                 kernel_size=cfg['conv1'][2],
                                 stride=cfg['conv1'][3],
                                 padding=cfg['conv1'][4],
                                 causal=causal,
                                 conv_type=conv_type,
                                 norm_layer=norm_layer,
                                 activation_layer=activation_layer)
        # blocks
        for i in range(2, len(cfg['block_num']) + 1):
            for j in range(cfg['block_num'][i - 1]):
                blocks_dic[f'b{i}_l{j}'] = BasicBneck(
                    cfg[f'b{i}_l{j}'][0],
                    cfg[f'b{i}_l{j}'][1],
                    cfg[f'b{i}_l{j}'][2],
                    cfg[f'b{i}_l{j}'][3],
                    cfg[f'b{i}_l{j}'][4],
                    cfg[f'b{i}_l{j}'][5],
                    cfg[f'b{i}_l{j}'][6],
                    causal=causal,
                    conv_type=conv_type,
                    norm_layer=norm_layer,
                    activation_layer=activation_layer)
        self.blocks = nn.Sequential(*(blocks_dic.values()))
        # conv7
        self.conv7 = ConvBlock3D(in_planes=cfg['conv7'][0],
                                 out_planes=cfg['conv7'][1],
                                 kernel_size=cfg['conv7'][2],
                                 stride=cfg['conv7'][3],
                                 padding=cfg['conv7'][4],
                                 causal=causal,
                                 conv_type=conv_type,
                                 norm_layer=norm_layer,
                                 activation_layer=activation_layer)
        # pool
        self.classifier = nn.Sequential(
            # dense9
            ConvBlock3D(in_planes=cfg['conv7'][1],
                        out_planes=hidden_dim,
                        kernel_size=(1, 1, 1),
                        causal=causal,
                        conv_type=conv_type,
                        bias_attr=True),
            nn.Swish(),
            nn.Dropout(p=0.2),
            # dense10d
            ConvBlock3D(in_planes=hidden_dim,
                        out_planes=num_classes,
                        kernel_size=(1, 1, 1),
                        causal=causal,
                        conv_type=conv_type,
                        bias_attr=True),
        )
        if causal:
            self.cgap = TemporalCGAvgPool3D()
        self.apply(self._weight_init)
        self.causal = causal

    def avg(self, x: paddle.Tensor) -> paddle.Tensor:
        if self.causal:
            avg = F.adaptive_avg_pool3d(x, (x.shape[2], 1, 1))
            avg = self.cgap(avg)[:, :, -1:]
        else:
            avg = F.adaptive_avg_pool3d(x, 1)
        return avg

    @staticmethod
    def _weight_init(m):
        # Initializer instances are applied by calling them on a parameter.
        if isinstance(m, nn.Conv3D):
            nn.initializer.KaimingNormal()(m.weight)
            if m.bias is not None:
                nn.initializer.Constant(0.0)(m.bias)
        elif isinstance(m, (nn.BatchNorm3D, nn.BatchNorm2D, nn.GroupNorm)):
            nn.initializer.Constant(1.0)(m.weight)
            nn.initializer.Constant(0.0)(m.bias)
        elif isinstance(m, nn.Linear):
            nn.initializer.Normal(mean=0.0, std=0.01)(m.weight)
            nn.initializer.Constant(0.0)(m.bias)

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        x = self.conv1(x)
        x = self.blocks(x)
        x = self.conv7(x)
        x = self.avg(x)
        x = self.classifier(x)
        x = x.flatten(1)
        return x

    @staticmethod
    def _clean_activation_buffers(m):
        if issubclass(type(m), CausalModule):
            m.reset_activation()

    def clean_activation_buffers(self) -> None:
        self.apply(self._clean_activation_buffers)


if __name__ == '__main__':
    net = MoViNet(causal=False, conv_type='3d')
    paddle.summary(net, input_size=(1, 3, 8, 224, 224))
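
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; `video_chunks` below is a hypothetical
# iterable of [N, C, T, H, W] tensors). In causal mode the stream buffers in
# every CausalModule carry temporal context across calls, so a long video can
# be classified chunk by chunk:
#
#     net = MoViNet(model_type='A0', causal=True, conv_type='3d')
#     net.eval()
#     for chunk in video_chunks:       # e.g. shape [1, 3, 8, 224, 224]
#         logits = net(chunk)          # context flows across chunk boundaries
#     net.clean_activation_buffers()   # reset the buffers before a new video
# ---------------------------------------------------------------------------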