# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is refer from:
https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer_hybrid.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from itertools import repeat
import collections.abc
import math
from functools import partial
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppocr.modeling.backbones.rec_resnetv2 import (
ResNetV2,
StdConv2dSame,
DropPath,
get_padding,
)
from paddle.nn.initializer import (
TruncatedNormal,
Constant,
Normal,
KaimingUniform,
XavierUniform,
)
normal_ = Normal(mean=0.0, std=1e-6)
zeros_ = Constant(value=0.0)
ones_ = Constant(value=1.0)
kaiming_normal_ = KaimingUniform(nonlinearity="relu")  # NOTE: Kaiming *uniform*, despite the name
trunc_normal_ = TruncatedNormal(std=0.02)
xavier_uniform_ = XavierUniform()
def _ntuple(n):
def parse(x):
if isinstance(x, collections.abc.Iterable):
return x
return tuple(repeat(x, n))
return parse
to_1tuple = _ntuple(1)
to_2tuple = _ntuple(2)
to_3tuple = _ntuple(3)
to_4tuple = _ntuple(4)
to_ntuple = _ntuple
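
# The _ntuple helpers broadcast scalars to fixed-length tuples and pass
# iterables through unchanged, e.g.:
#   to_2tuple(224)      -> (224, 224)
#   to_2tuple((42, 12)) -> (42, 12)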
class Conv2dAlign(nn.Conv2D):
    """Conv2D used in place of the weight-standardized conv for BiT ResNet-V2 models.
    Note: unlike `StdConv2dSame`, `forward` applies a plain convolution; no
    weight standardization is performed. Paper reference kept from the original:
    `Micro-Batch Training with Batch-Channel Normalization and Weight
    Standardization` - https://arxiv.org/abs/1903.10520v2
    """
def __init__(
self,
in_channel,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
eps=1e-6,
):
super().__init__(
in_channel,
out_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias_attr=bias,
weight_attr=True,
)
        self.eps = eps  # retained for API parity; not used by the plain-conv forward
def forward(self, x):
x = F.conv2d(
x,
self.weight,
self.bias,
self._stride,
self._padding,
self._dilation,
self._groups,
)
return x
class HybridEmbed(nn.Layer):
"""CNN Feature Map Embedding
Extract feature map from CNN, flatten, project to embedding dim.
"""
def __init__(
self,
backbone,
img_size=224,
patch_size=1,
feature_size=None,
in_chans=3,
embed_dim=768,
):
super().__init__()
assert isinstance(backbone, nn.Layer)
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
self.img_size = img_size
self.patch_size = patch_size
self.backbone = backbone
        # These values are hard-coded for the LaTeX-OCR ResNetV2 backbone and
        # override the `feature_size` / `patch_size` arguments (`in_chans` is
        # likewise unused): the backbone emits a 1024-channel map on a 42 x 12
        # grid, i.e. a 672 x 192 canvas at total stride 16.
        feature_dim = 1024
        feature_size = (42, 12)
        patch_size = (1, 1)
assert (
feature_size[0] % patch_size[0] == 0
and feature_size[1] % patch_size[1] == 0
)
self.grid_size = (
feature_size[0] // patch_size[0],
feature_size[1] // patch_size[1],
)
self.num_patches = self.grid_size[0] * self.grid_size[1]
self.proj = nn.Conv2D(
feature_dim,
embed_dim,
kernel_size=patch_size,
stride=patch_size,
weight_attr=True,
bias_attr=True,
)
def forward(self, x):
x = self.backbone(x)
if isinstance(x, (list, tuple)):
x = x[-1] # last feature if backbone outputs list/tuple of features
x = self.proj(x).flatten(2).transpose([0, 2, 1])
return x
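
# Shape sketch for HybridEmbed.forward (illustrative input; the channel/grid
# values follow the hard-coded LaTeX-OCR backbone above):
#   x: [N, 1, 64, 320] -> backbone -> [N, 1024, 4, 20]   (total stride 16)
#   -> proj (1x1 conv) -> [N, embed_dim, 4, 20]
#   -> flatten(2)      -> [N, embed_dim, 80]
#   -> transpose       -> [N, 80, embed_dim]              (one token per patch)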
class myLinear(nn.Linear):
    """Linear layer computing x @ W^T + b (PyTorch-style weight semantics).
    With Paddle's native [in, out] weight layout this is only shape-compatible
    for a square weight, which is how it is used here (the dim -> dim output
    projection in Attention), presumably to ease porting PyTorch checkpoints.
    """

    def __init__(self, in_channel, out_channels, weight_attr=True, bias_attr=True):
super().__init__(
in_channel, out_channels, weight_attr=weight_attr, bias_attr=bias_attr
)
def forward(self, x):
return paddle.matmul(x, self.weight, transpose_y=True) + self.bias
class Attention(nn.Layer):
def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = head_dim**-0.5
self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = myLinear(dim, dim, weight_attr=True, bias_attr=True)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x):
B, N, C = x.shape
qkv = (
self.qkv(x)
.reshape([B, N, 3, self.num_heads, C // self.num_heads])
.transpose([2, 0, 3, 1, 4])
)
        q, k, v = qkv.unbind(0)  # split into query, key, value: each [B, num_heads, N, head_dim]
attn = (q @ k.transpose([0, 1, 3, 2])) * self.scale
attn = F.softmax(attn, axis=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B, N, C])
x = self.proj(x)
x = self.proj_drop(x)
return x
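
# Shape sketch for Attention.forward with input [B, N, C] and num_heads = h:
#   qkv(x)   -> [B, N, 3C] -> reshape/transpose -> q, k, v: each [B, h, N, C/h]
#   q @ k^T  -> [B, h, N, N], scaled by (C/h) ** -0.5, softmax over the last axis
#   attn @ v -> [B, h, N, C/h] -> transpose/reshape -> [B, N, C] -> proj -> drop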
class Mlp(nn.Layer):
"""MLP as used in Vision Transformer, MLP-Mixer and related networks"""
def __init__(
self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.0,
):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
drop_probs = to_2tuple(drop)
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.drop1 = nn.Dropout(drop_probs[0])
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop2 = nn.Dropout(drop_probs[1])
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop1(x)
x = self.fc2(x)
x = self.drop2(x)
return x
class Block(nn.Layer):
def __init__(
self,
dim,
num_heads,
mlp_ratio=4.0,
qkv_bias=False,
drop=0.0,
attn_drop=0.0,
drop_path=0.0,
act_layer=nn.GELU,
norm_layer=nn.LayerNorm,
):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
attn_drop=attn_drop,
proj_drop=drop,
)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(
in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop,
)
def forward(self, x):
x = x + self.drop_path(self.attn(self.norm1(x)))
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
class HybridTransformer(nn.Layer):
    """Implementation of HybridTransformer (ResNetV2 stem + ViT encoder).
    Args:
        x: input images with shape [N, 1, H, W]
        label: LaTeX-OCR labels with shape [N, L], where L is the max sequence length
        attention_mask: LaTeX-OCR attention mask with shape [N, L]
    Returns:
        The encoded token sequence with shape [N, 1 + (H//16) * (W//16), C]: one
        class token plus one token per 16x16 input patch, where C is num_classes
        when a classifier head is configured and embed_dim otherwise. During
        training, the labels and attention mask are passed through unchanged.
    """
def __init__(
self,
backbone_layers=[2, 3, 7],
input_channel=1,
is_predict=False,
is_export=False,
img_size=(224, 224),
patch_size=16,
num_classes=1000,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4.0,
qkv_bias=True,
representation_size=None,
distilled=False,
drop_rate=0.0,
attn_drop_rate=0.0,
drop_path_rate=0.0,
embed_layer=None,
norm_layer=None,
act_layer=None,
weight_init="",
**kwargs,
):
super(HybridTransformer, self).__init__()
self.num_classes = num_classes
self.num_features = self.embed_dim = (
embed_dim # num_features for consistency with other models
)
self.num_tokens = 2 if distilled else 1
norm_layer = norm_layer or partial(nn.LayerNorm, epsilon=1e-6)
act_layer = act_layer or nn.GELU
self.height, self.width = img_size
self.patch_size = patch_size
backbone = ResNetV2(
layers=backbone_layers,
num_classes=0,
global_pool="",
in_chans=input_channel,
preact=False,
stem_type="same",
conv_layer=StdConv2dSame,
is_export=is_export,
)
min_patch_size = 2 ** (len(backbone_layers) + 1)
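        # e.g. backbone_layers=[2, 3, 7] -> min_patch_size = 2 ** 4 = 16, the
        # backbone's total stride, so the HybridEmbed patch size below is
        # patch_size // min_patch_size = 16 // 16 = 1 (one token per feature cell).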
self.patch_embed = HybridEmbed(
img_size=img_size,
patch_size=patch_size // min_patch_size,
in_chans=input_channel,
embed_dim=embed_dim,
backbone=backbone,
)
num_patches = self.patch_embed.num_patches
self.cls_token = paddle.create_parameter([1, 1, embed_dim], dtype="float32")
self.dist_token = (
paddle.create_parameter(
[1, 1, embed_dim],
dtype="float32",
)
if distilled
else None
)
self.pos_embed = paddle.create_parameter(
[1, num_patches + self.num_tokens, embed_dim], dtype="float32"
)
self.pos_drop = nn.Dropout(p=drop_rate)
zeros_(self.cls_token)
if self.dist_token is not None:
zeros_(self.dist_token)
zeros_(self.pos_embed)
dpr = [
x.item() for x in paddle.linspace(0, drop_path_rate, depth)
] # stochastic depth decay rule
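        # e.g. depth=12, drop_path_rate=0.1 yields per-block drop rates
        # [0.0, 0.0091, ..., 0.1]: deeper blocks are dropped more often.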
self.blocks = nn.Sequential(
*[
Block(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[i],
norm_layer=norm_layer,
act_layer=act_layer,
)
for i in range(depth)
]
)
self.norm = norm_layer(embed_dim)
# Representation layer
if representation_size and not distilled:
self.num_features = representation_size
self.pre_logits = nn.Sequential(
("fc", nn.Linear(embed_dim, representation_size)), ("act", nn.Tanh())
)
else:
self.pre_logits = nn.Identity()
# Classifier head(s)
self.head = (
nn.Linear(self.num_features, num_classes)
if num_classes > 0
else nn.Identity()
)
self.head_dist = None
if distilled:
self.head_dist = (
nn.Linear(self.embed_dim, self.num_classes)
if num_classes > 0
else nn.Identity()
)
self.init_weights(weight_init)
self.out_channels = embed_dim
self.is_predict = is_predict
self.is_export = is_export
def init_weights(self, mode=""):
assert mode in ("jax", "jax_nlhb", "nlhb", "")
        head_bias = -math.log(self.num_classes) if "nlhb" in mode else 0.0  # NOTE: computed but not forwarded to _init_vit_weights below
trunc_normal_(self.pos_embed)
trunc_normal_(self.cls_token)
self.apply(_init_vit_weights)
def _init_weights(self, m):
# this fn left here for compat with downstream users
_init_vit_weights(m)
def load_pretrained(self, checkpoint_path, prefix=""):
raise NotImplementedError
def no_weight_decay(self):
return {"pos_embed", "cls_token", "dist_token"}
def get_classifier(self):
if self.dist_token is None:
return self.head
else:
return self.head, self.head_dist
def reset_classifier(self, num_classes, global_pool=""):
self.num_classes = num_classes
self.head = (
nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
)
if self.num_tokens == 2:
self.head_dist = (
nn.Linear(self.embed_dim, self.num_classes)
if num_classes > 0
else nn.Identity()
)
def forward_features(self, x):
B, c, h, w = x.shape
x = self.patch_embed(x)
cls_tokens = self.cls_token.expand(
[B, -1, -1]
) # stole cls_tokens impl from Phil Wang, thanks
x = paddle.concat((cls_tokens, x), axis=1)
h, w = h // self.patch_size, w // self.patch_size
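        # Index into the full-width positional grid: the grid has
        # width // patch_size columns, so a crop that is only w tokens wide
        # skips (width // patch_size - w) column slots per row. For row i,
        # token j this yields flat index i * (width // patch_size) + j; the
        # +1 below offsets past the cls token, whose position (index 0) is
        # prepended.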
repeat_tensor = (
paddle.arange(h) * (self.width // self.patch_size - w)
).reshape([-1, 1])
repeat_tensor = paddle.repeat_interleave(
repeat_tensor, paddle.to_tensor(w), axis=1
).reshape([-1])
pos_emb_ind = repeat_tensor + paddle.arange(h * w)
pos_emb_ind = paddle.concat(
(paddle.zeros([1], dtype="int64"), pos_emb_ind + 1), axis=0
).cast(paddle.int64)
x += self.pos_embed[:, pos_emb_ind]
x = self.pos_drop(x)
for blk in self.blocks:
x = blk(x)
x = self.norm(x)
return x
def forward(self, input_data):
if self.training:
x, label, attention_mask = input_data
else:
if isinstance(input_data, list):
x = input_data[0]
else:
x = input_data
x = self.forward_features(x)
x = self.head(x)
if self.training:
return x, label, attention_mask
else:
return x
def _init_vit_weights(
module: nn.Layer, name: str = "", head_bias: float = 0.0, jax_impl: bool = False
):
"""ViT weight initialization
* When called without n, head_bias, jax_impl args it will behave exactly the same
as my original init for compatibility with prev hparam / downstream use cases (ie DeiT).
* When called w/ valid n (module name) and jax_impl=True, will (hopefully) match JAX impl
"""
if isinstance(module, nn.Linear):
if name.startswith("head"):
zeros_(module.weight)
            constant_ = Constant(value=head_bias)
            constant_(module.bias)
elif name.startswith("pre_logits"):
zeros_(module.bias)
else:
if jax_impl:
xavier_uniform_(module.weight)
if module.bias is not None:
if "mlp" in name:
normal_(module.bias)
else:
zeros_(module.bias)
else:
trunc_normal_(module.weight)
if module.bias is not None:
zeros_(module.bias)
elif jax_impl and isinstance(module, nn.Conv2D):
        # NOTE: conv weights are left at the framework default, as in the original timm init
if module.bias is not None:
zeros_(module.bias)
elif isinstance(module, (nn.LayerNorm, nn.GroupNorm, nn.BatchNorm2D)):
zeros_(module.bias)
ones_(module.weight)
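

if __name__ == "__main__":
    # Hedged smoke test (not part of the original module). The sizes below are
    # assumptions inferred from the hard-coded (42, 12) feature grid in
    # HybridEmbed, i.e. a 672 x 192 max canvas at total stride 16; they are not
    # an official PaddleOCR config.
    model = HybridTransformer(
        backbone_layers=[2, 3, 7],
        input_channel=1,
        img_size=(192, 672),  # (height, width); forward indexes the pos grid by width
        patch_size=16,
        num_classes=0,  # 0 -> Identity head, so features keep embed_dim channels
        embed_dim=768,
        depth=12,
        num_heads=12,
    )
    model.eval()
    # Any H, W divisible by 16 and within the canvas should work.
    x = paddle.randn([1, 1, 64, 320])
    out = model(x)
    # Expected: [1, 1 + (64 // 16) * (320 // 16), 768] = [1, 81, 768]
    print(out.shape)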