# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License" # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import math from collections import OrderedDict import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn import BatchNorm2D, Conv2D from paddle.nn.initializer import Constant, Normal from paddle.vision.models import ResNet from ...utils import load_ckpt from ..registry import BACKBONES from ..weight_init import kaiming_normal_, _calculate_fan_in_and_fan_out zeros_ = Constant(value=0.) ones_ = Constant(value=1.) normal_ = Normal(mean=0, std=1e-3) def disp_to_depth(disp, min_depth, max_depth): """Convert network's sigmoid output into depth prediction The formula for this conversion is given in the 'additional considerations' section of the paper. """ min_disp = 1 / max_depth max_disp = 1 / min_depth scaled_disp = min_disp + (max_disp - min_disp) * disp depth = 1 / scaled_disp return scaled_disp, depth def gram_matrix(y): (b, ch, h, w) = y.shape features = y.reshape([b, ch, w * h]) features_t = paddle.transpose(features, [0, 2, 1]) gram = features.bmm(features_t) / (ch * h * w) return gram def convt_bn_relu(in_channels, out_channels, kernel_size, stride=1, padding=0, output_padding=0, bn=True, relu=True): bias = not bn layers = [] layers.append( nn.Conv2DTranspose(in_channels, out_channels, kernel_size, stride, padding, output_padding, bias_attr=bias)) if bn: layers.append(nn.BatchNorm2D(out_channels)) if relu: layers.append(nn.LeakyReLU(0.2)) layers = nn.Sequential(*layers) # initialize the weights for m in layers.sublayers(include_self=True): if isinstance(m, nn.Conv2DTranspose): normal_(m.weight) if m.bias is not None: zeros_(m.bias) elif isinstance(m, nn.BatchNorm2D): ones_(m.weight) zeros_(m.bias) return layers def transformation_from_parameters(axisangle, translation, invert=False): """Convert the network's (axisangle, translation) output into a 4x4 matrix """ R = rot_from_axisangle(axisangle) t = translation.clone() if invert: R = R.transpose([0, 2, 1]) t *= -1 T = get_translation_matrix(t) if invert: M = paddle.matmul(R, T) else: M = paddle.matmul(T, R) return M def get_translation_matrix(translation_vector): """Convert a translation vector into a 4x4 transformation matrix """ t = translation_vector.reshape([-1, 3, 1]) gather_object = paddle.stack([ paddle.zeros([ translation_vector.shape[0], ], paddle.float32), paddle.ones([ translation_vector.shape[0], ], paddle.float32), paddle.squeeze(t[:, 0], axis=-1), paddle.squeeze(t[:, 1], axis=-1), paddle.squeeze(t[:, 2], axis=-1), ]) gather_index = paddle.to_tensor([ [1], [0], [0], [2], [0], [1], [0], [3], [0], [0], [1], [4], [0], [0], [0], [1], ]) T = paddle.gather_nd(gather_object, gather_index) T = T.reshape([4, 4, -1]).transpose((2, 0, 1)) return T def rot_from_axisangle(vec): """Convert an axisangle rotation into a 4x4 transformation matrix (adapted from https://github.com/Wallacoloo/printipi) Input 'vec' has to be Bx1x3 """ angle = paddle.norm(vec, 2, 2, True) axis = vec / (angle + 1e-7) ca = paddle.cos(angle) sa = paddle.sin(angle) C = 1 - ca x = axis[..., 0].unsqueeze(1) y = axis[..., 1].unsqueeze(1) z = axis[..., 2].unsqueeze(1) xs = x * sa ys = y * sa zs = z * sa xC = x * C yC = y * C zC = z * C xyC = x * yC yzC = y * zC zxC = z * xC gather_object = paddle.stack([ paddle.squeeze(x * xC + ca, axis=(-1, -2)), paddle.squeeze(xyC - zs, axis=(-1, -2)), paddle.squeeze(zxC + ys, axis=(-1, -2)), paddle.squeeze(xyC + zs, axis=(-1, -2)), paddle.squeeze(y * yC + ca, axis=(-1, -2)), paddle.squeeze(yzC - xs, axis=(-1, -2)), paddle.squeeze(zxC - ys, axis=(-1, -2)), paddle.squeeze(yzC + xs, axis=(-1, -2)), paddle.squeeze(z * zC + ca, axis=(-1, -2)), paddle.ones([ vec.shape[0], ], dtype=paddle.float32), paddle.zeros([ vec.shape[0], ], dtype=paddle.float32) ]) gather_index = paddle.to_tensor([ [0], [1], [2], [10], [3], [4], [5], [10], [6], [7], [8], [10], [10], [10], [10], [9], ]) rot = paddle.gather_nd(gather_object, gather_index) rot = rot.reshape([4, 4, -1]).transpose((2, 0, 1)) return rot def upsample(x): """Upsample input tensor by a factor of 2 """ return F.interpolate(x, scale_factor=2, mode="nearest") def get_smooth_loss(disp, img): """Computes the smoothness loss for a disparity image The color image is used for edge-aware smoothness """ grad_disp_x = paddle.abs(disp[:, :, :, :-1] - disp[:, :, :, 1:]) grad_disp_y = paddle.abs(disp[:, :, :-1, :] - disp[:, :, 1:, :]) grad_img_x = paddle.mean(paddle.abs(img[:, :, :, :-1] - img[:, :, :, 1:]), 1, keepdim=True) grad_img_y = paddle.mean(paddle.abs(img[:, :, :-1, :] - img[:, :, 1:, :]), 1, keepdim=True) grad_disp_x *= paddle.exp(-grad_img_x) grad_disp_y *= paddle.exp(-grad_img_y) return grad_disp_x.mean() + grad_disp_y.mean() def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): """3x3 convolution with padding""" return nn.Conv2D(in_planes, out_planes, kernel_size=3, stride=stride, padding=dilation, groups=groups, bias_attr=False, dilation=dilation) def conv1x1(in_planes, out_planes, stride=1): """1x1 convolution""" return nn.Conv2D(in_planes, out_planes, kernel_size=1, stride=stride, bias_attr=False) def resnet_multiimage_input(num_layers, num_input_images=1): """Constructs a ResNet model. Args: num_layers (int): Number of resnet layers. Must be 18 or 50 pretrained (bool): If True, returns a model pre-trained on ImageNet num_input_images (int): Number of frames stacked as input """ assert num_layers in [18, 50], "Can only run with 18 or 50 layer resnet" blocks = {18: [2, 2, 2, 2], 50: [3, 4, 6, 3]}[num_layers] block_type = {18: BasicBlock, 50: Bottleneck}[num_layers] model = ResNetMultiImageInput(block_type, num_layers, blocks, num_input_images=num_input_images) model.init_weights() return model class ConvBlock(nn.Layer): """Layer to perform a convolution followed by ELU """ def __init__(self, in_channels, out_channels): super(ConvBlock, self).__init__() self.conv = Conv3x3(in_channels, out_channels) self.nonlin = nn.ELU() def forward(self, x): out = self.conv(x) out = self.nonlin(out) return out class Conv3x3(nn.Layer): """Layer to pad and convolve input """ def __init__(self, in_channels, out_channels, use_refl=True): super(Conv3x3, self).__init__() if use_refl: self.pad = nn.Pad2D(1, mode='reflect') else: self.pad = nn.Pad2D(1) self.conv = nn.Conv2D(int(in_channels), int(out_channels), 3) def forward(self, x): out = self.pad(x) out = self.conv(out) return out class BackprojectDepth(nn.Layer): """Layer to transform a depth image into a point cloud """ def __init__(self, batch_size, height, width): super(BackprojectDepth, self).__init__() self.batch_size = batch_size self.height = height self.width = width meshgrid = np.meshgrid(range(self.width), range(self.height), indexing='xy') id_coords = np.stack(meshgrid, axis=0).astype(np.float32) self.id_coords = self.create_parameter(shape=list(id_coords.shape), dtype=paddle.float32) self.id_coords.set_value(id_coords) self.add_parameter("id_coords", self.id_coords) self.id_coords.stop_gradient = True self.ones = self.create_parameter( shape=[self.batch_size, 1, self.height * self.width], default_initializer=ones_) self.add_parameter("ones", self.ones) self.ones.stop_gradient = True pix_coords = paddle.unsqueeze( paddle.stack([ self.id_coords[0].reshape([ -1, ]), self.id_coords[1].reshape([ -1, ]) ], 0), 0) pix_coords = pix_coords.tile([batch_size, 1, 1]) pix_coords = paddle.concat([pix_coords, self.ones], 1) self.pix_coords = self.create_parameter(shape=list(pix_coords.shape), ) self.pix_coords.set_value(pix_coords) self.add_parameter("pix_coords", self.pix_coords) self.pix_coords.stop_gradient = True def forward(self, depth, inv_K): cam_points = paddle.matmul(inv_K[:, :3, :3], self.pix_coords) cam_points = depth.reshape([self.batch_size, 1, -1]) * cam_points cam_points = paddle.concat([cam_points, self.ones], 1) return cam_points class Project3D(nn.Layer): """Layer which projects 3D points into a camera with intrinsics K and at position T """ def __init__(self, batch_size, height, width, eps=1e-7): super(Project3D, self).__init__() self.batch_size = batch_size self.height = height self.width = width self.eps = eps def forward(self, points, K, T): P = paddle.matmul(K, T)[:, :3, :] cam_points = paddle.matmul(P, points) pix_coords = cam_points[:, :2, :] / (cam_points[:, 2, :].unsqueeze(1) + self.eps) pix_coords = pix_coords.reshape( [self.batch_size, 2, self.height, self.width]) pix_coords = pix_coords.transpose([0, 2, 3, 1]) pix_coords[..., 0] /= self.width - 1 pix_coords[..., 1] /= self.height - 1 pix_coords = (pix_coords - 0.5) * 2 return pix_coords class SSIM(nn.Layer): """Layer to compute the SSIM loss between a pair of images """ def __init__(self): super(SSIM, self).__init__() self.mu_x_pool = nn.AvgPool2D(3, 1, exclusive=False) self.mu_y_pool = nn.AvgPool2D(3, 1, exclusive=False) self.sig_x_pool = nn.AvgPool2D(3, 1, exclusive=False) self.sig_y_pool = nn.AvgPool2D(3, 1, exclusive=False) self.sig_xy_pool = nn.AvgPool2D(3, 1, exclusive=False) self.refl = nn.Pad2D(1, mode='reflect') self.C1 = 0.01**2 self.C2 = 0.03**2 def forward(self, x, y): x = self.refl(x) y = self.refl(y) mu_x = self.mu_x_pool(x) mu_y = self.mu_y_pool(y) sigma_x = self.sig_x_pool(x**2) - mu_x**2 sigma_y = self.sig_y_pool(y**2) - mu_y**2 sigma_xy = self.sig_xy_pool(x * y) - mu_x * mu_y SSIM_n = (2 * mu_x * mu_y + self.C1) * (2 * sigma_xy + self.C2) SSIM_d = (mu_x**2 + mu_y**2 + self.C1) * (sigma_x + sigma_y + self.C2) return paddle.clip((1 - SSIM_n / SSIM_d) / 2, 0, 1) class ResNetMultiImageInput(ResNet): """Constructs a resnet model with varying number of input images. Adapted from https://github.com/pypaddle/vision/blob/master/paddlevision/models/resnet.py """ def __init__(self, block, depth, layers, num_input_images=1): super(ResNetMultiImageInput, self).__init__(block, depth) self.inplanes = 64 self.conv1 = nn.Conv2D(num_input_images * 3, 64, kernel_size=7, stride=2, padding=3, bias_attr=False) self.bn1 = nn.BatchNorm2D(64) self.relu = nn.ReLU() self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2) self.layer3 = self._make_layer(block, 256, layers[2], stride=2) self.layer4 = self._make_layer(block, 512, layers[3], stride=2) def init_weights(self): for layer in self.sublayers(include_self=True): if isinstance(layer, nn.Conv2D): kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu') elif isinstance(layer, nn.BatchNorm2D): ones_(layer.weight) zeros_(layer.bias) class ConvBNLayer(nn.Layer): """Conv2D and BatchNorm2D layer. Args: in_channels (int): Number of channels for the input. out_channels (int): Number of channels for the output. kernel_size (int): Kernel size. stride (int): Stride in the Conv2D layer. Default: 1. groups (int): Groups in the Conv2D, Default: 1. act (str): Indicate activation after BatchNorm2D layer. name (str): the name of an instance of ConvBNLayer. Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method. """ def __init__(self, in_channels, out_channels, kernel_size, stride=1, groups=1, act=None, name=None): super(ConvBNLayer, self).__init__() self._conv = Conv2D(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=(kernel_size - 1) // 2, groups=groups, bias_attr=False) self._act = act self._batch_norm = BatchNorm2D(out_channels) def forward(self, inputs): y = self._conv(inputs) y = self._batch_norm(y) if self._act: y = getattr(paddle.nn.functional, self._act)(y) return y class BasicBlock(nn.Layer): expansion = 1 def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None): super(BasicBlock, self).__init__() if norm_layer is None: norm_layer = nn.BatchNorm2D if groups != 1 or base_width != 64: raise ValueError( 'BasicBlock only supports groups=1 and base_width=64') if dilation > 1: raise NotImplementedError( "Dilation > 1 not supported in BasicBlock") # Both self.conv1 and self.downsample layers downsample the input when stride != 1 self.conv1 = conv3x3(inplanes, planes, stride) self.bn1 = norm_layer(planes) self.relu = nn.ReLU() self.conv2 = conv3x3(planes, planes) self.bn2 = norm_layer(planes) self.downsample = downsample self.stride = stride def forward(self, x): identity = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = self.relu(out) return out class Bottleneck(nn.Layer): # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) # while original implementation places the stride at the first 1x1 convolution(self.conv1) # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. # This variant is also known as ResNet V1.5 and improves accuracy according to # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. expansion = 4 def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None): super(Bottleneck, self).__init__() if norm_layer is None: norm_layer = nn.BatchNorm2D width = int(planes * (base_width / 64.)) * groups self.conv1 = conv1x1(inplanes, width) self.bn1 = norm_layer(width) self.conv2 = conv3x3(width, width, stride, groups, dilation) self.bn2 = norm_layer(width) self.conv3 = conv1x1(width, planes * self.expansion) self.bn3 = norm_layer(planes * self.expansion) self.relu = nn.ReLU() self.downsample = downsample self.stride = stride def forward(self, x): identity = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) out = self.relu(out) out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = self.relu(out) return out class DepthDecoder(nn.Layer): def __init__(self, num_ch_enc, scales=range(4), num_output_channels=1, use_skips=True): super(DepthDecoder, self).__init__() self.num_output_channels = num_output_channels self.use_skips = use_skips self.upsample_mode = 'nearest' self.scales = scales self.num_ch_enc = num_ch_enc self.num_ch_dec = np.array([16, 32, 64, 128, 256]) # decoder self.convs = OrderedDict() for i in range(4, -1, -1): # upconv_0 num_ch_in = self.num_ch_enc[-1] if i == 4 else self.num_ch_dec[i + 1] num_ch_out = self.num_ch_dec[i] self.convs[("upconv", i, 0)] = ConvBlock(num_ch_in, num_ch_out) # upconv_1 num_ch_in = self.num_ch_dec[i] if self.use_skips and i > 0: num_ch_in += self.num_ch_enc[i - 1] num_ch_out = self.num_ch_dec[i] self.convs[("upconv", i, 1)] = ConvBlock(num_ch_in, num_ch_out) for s in self.scales: self.convs[("dispconv", s)] = Conv3x3(self.num_ch_dec[s], self.num_output_channels) self.decoder = nn.LayerList(list(self.convs.values())) self.sigmoid = nn.Sigmoid() def forward(self, input_features): outputs = {} # decoder x = input_features[-1] for i in range(4, -1, -1): x = self.convs[("upconv", i, 0)](x) x = [upsample(x)] if self.use_skips and i > 0: x += [input_features[i - 1]] x = paddle.concat(x, 1) x = self.convs[("upconv", i, 1)](x) if i in self.scales: outputs[("disp", i)] = self.sigmoid(self.convs[("dispconv", i)](x)) return outputs class PoseDecoder(nn.Layer): def __init__(self, num_ch_enc, num_input_features, num_frames_to_predict_for=None, stride=1): super(PoseDecoder, self).__init__() self.num_ch_enc = num_ch_enc self.num_input_features = num_input_features if num_frames_to_predict_for is None: num_frames_to_predict_for = num_input_features - 1 self.num_frames_to_predict_for = num_frames_to_predict_for self.convs = OrderedDict() self.convs[("squeeze")] = nn.Conv2D(self.num_ch_enc[-1], 256, 1) self.convs[("pose", 0)] = nn.Conv2D(num_input_features * 256, 256, 3, stride, 1) self.convs[("pose", 1)] = nn.Conv2D(256, 256, 3, stride, 1) self.convs[("pose", 2)] = nn.Conv2D(256, 6 * num_frames_to_predict_for, 1) self.relu = nn.ReLU() self.net = nn.LayerList(list(self.convs.values())) def forward(self, input_features): last_features = [f[-1] for f in input_features] cat_features = [ self.relu(self.convs["squeeze"](f)) for f in last_features ] cat_features = paddle.concat(cat_features, 1) out = cat_features for i in range(3): out = self.convs[("pose", i)](out) if i != 2: out = self.relu(out) out = out.mean(3).mean(2) out = 0.01 * out.reshape([-1, self.num_frames_to_predict_for, 1, 6]) axisangle = out[..., :3] translation = out[..., 3:] return axisangle, translation class ResnetEncoder(nn.Layer): """Pypaddle module for a resnet encoder """ def __init__(self, num_layers, pretrained=False, num_input_images=1): super(ResnetEncoder, self).__init__() self.num_ch_enc = np.array([64, 64, 128, 256, 512]) resnets = { 18: paddle.vision.models.resnet18, 34: paddle.vision.models.resnet34, 50: paddle.vision.models.resnet50, 101: paddle.vision.models.resnet101, 152: paddle.vision.models.resnet152 } if num_layers not in resnets: raise ValueError( "{} is not a valid number of resnet layers".format(num_layers)) if num_input_images > 1: self.encoder = resnet_multiimage_input(num_layers, pretrained, num_input_images) else: self.encoder = resnets[num_layers](pretrained) if num_layers > 34: self.num_ch_enc[1:] *= 4 ###################################### # night public first conv ###################################### self.conv1 = nn.Conv2D(3, 64, kernel_size=7, stride=2, padding=3, bias_attr=False) self.bn1 = nn.BatchNorm2D(64) self.relu = nn.ReLU() # NOTE self.conv_shared = nn.Conv2D(512, 64, kernel_size=1) ########################################## # private source encoder, day ########################################## self.encoder_day = resnets[num_layers](pretrained) self.conv_diff_day = nn.Conv2D( 512, 64, kernel_size=1) # no bn after conv, so bias=true ########################################## # private target encoder, night ########################################## self.encoder_night = resnets[num_layers](pretrained) self.conv_diff_night = nn.Conv2D(512, 64, kernel_size=1) ###################################### # shared decoder (small decoder), use a simple de-conv to upsample the features with no skip connection ###################################### self.convt5 = convt_bn_relu(in_channels=512, out_channels=256, kernel_size=3, stride=2, padding=1, output_padding=1) self.convt4 = convt_bn_relu(in_channels=256, out_channels=128, kernel_size=3, stride=2, padding=1, output_padding=1) self.convt3 = convt_bn_relu(in_channels=128, out_channels=64, kernel_size=3, stride=2, padding=1, output_padding=1) self.convt2 = convt_bn_relu(in_channels=64, out_channels=64, kernel_size=3, stride=2, padding=1, output_padding=1) self.convt1 = convt_bn_relu(in_channels=64, out_channels=64, kernel_size=3, stride=2, padding=1, output_padding=1) self.convtf = nn.Conv2D(64, 3, kernel_size=1, stride=1, padding=0) def forward(self, input_image, is_night): if self.training: result = [] input_data = (input_image - 0.45) / 0.225 if is_night == 'day': # source private encoder, day private_feature = self.encoder_day.conv1(input_data) private_feature = self.encoder_day.bn1(private_feature) private_feature = self.encoder_day.relu(private_feature) private_feature = self.encoder_day.maxpool(private_feature) private_feature = self.encoder_day.layer1(private_feature) private_feature = self.encoder_day.layer2(private_feature) private_feature = self.encoder_day.layer3(private_feature) private_feature = self.encoder_day.layer4(private_feature) private_code = self.conv_diff_day(private_feature) private_gram = gram_matrix(private_feature) result.append(private_code) result.append(private_gram) elif is_night == 'night': # target private encoder, night private_feature = self.encoder_night.conv1(input_data) private_feature = self.encoder_night.bn1(private_feature) private_feature = self.encoder_night.relu(private_feature) private_feature = self.encoder_night.maxpool(private_feature) private_feature = self.encoder_night.layer1(private_feature) private_feature = self.encoder_night.layer2(private_feature) private_feature = self.encoder_night.layer3(private_feature) private_feature = self.encoder_night.layer4(private_feature) private_code = self.conv_diff_night(private_feature) private_gram = gram_matrix(private_feature) result.append(private_code) result.append(private_gram) # shared encoder self.features = [] x = (input_image - 0.45) / 0.225 if is_night == 'day': x = self.encoder.conv1(x) x = self.encoder.bn1(x) self.features.append(self.encoder.relu(x)) else: x = self.conv1(x) x = self.bn1(x) self.features.append(self.relu(x)) self.features.append( self.encoder.layer1(self.encoder.maxpool(self.features[-1]))) self.features.append(self.encoder.layer2(self.features[-1])) self.features.append(self.encoder.layer3(self.features[-1])) self.features.append(self.encoder.layer4(self.features[-1])) if self.training: shared_code = self.conv_shared(self.features[-1]) shared_gram = gram_matrix(self.features[-1]) result.append(shared_code) # use this to calculate loss of diff result.append(shared_gram) result.append( self.features[-1]) # use this to calculate loss of similarity union_code = private_feature + self.features[-1] rec_code = self.convt5(union_code) rec_code = self.convt4(rec_code) rec_code = self.convt3(rec_code) rec_code = self.convt2(rec_code) rec_code = self.convt1(rec_code) rec_code = self.convtf(rec_code) result.append(rec_code) return self.features, result else: return self.features class ResnetEncoder_pose(nn.Layer): """Pypaddle module for a resnet encoder """ def __init__(self, num_layers, pretrained=False, num_input_images=1): super(ResnetEncoder_pose, self).__init__() self.num_ch_enc = np.array([64, 64, 128, 256, 512]) resnets = { 18: paddle.vision.models.resnet18, 34: paddle.vision.models.resnet34, 50: paddle.vision.models.resnet50, 101: paddle.vision.models.resnet101, 152: paddle.vision.models.resnet152 } if num_layers not in resnets: raise ValueError( "{} is not a valid number of resnet layers".format(num_layers)) if num_input_images > 1: self.encoder = resnet_multiimage_input(num_layers, num_input_images) else: self.encoder = resnets[num_layers](pretrained) if num_layers > 34: self.num_ch_enc[1:] *= 4 def forward(self, input_image): features = [] x = (input_image - 0.45) / 0.225 x = self.encoder.conv1(x) x = self.encoder.bn1(x) features.append(self.encoder.relu(x)) features.append(self.encoder.layer1(self.encoder.maxpool(features[-1]))) features.append(self.encoder.layer2(features[-1])) features.append(self.encoder.layer3(features[-1])) features.append(self.encoder.layer4(features[-1])) return features @BACKBONES.register() class ADDS_DepthNet(nn.Layer): def __init__(self, num_layers=18, frame_ids=[0, -1, 1], height=256, width=512, batch_size=6, pose_model_input="pairs", use_stereo=False, only_depth_encoder=False, pretrained=None, scales=[0, 1, 2, 3], min_depth=0.1, max_depth=100.0, pose_model_type='separate_resnet', v1_multiscale=False, predictive_mask=False, disable_automasking=False): super(ADDS_DepthNet, self).__init__() self.num_layers = num_layers self.height = height self.width = width self.batch_size = batch_size self.frame_ids = frame_ids self.pose_model_input = pose_model_input self.use_stereo = use_stereo self.only_depth_encoder = only_depth_encoder self.pretrained = pretrained self.scales = scales self.pose_model_type = pose_model_type self.predictive_mask = predictive_mask self.disable_automasking = disable_automasking self.v1_multiscale = v1_multiscale self.min_depth = min_depth self.max_depth = max_depth self.num_input_frames = len(self.frame_ids) self.num_pose_frames = 2 if self.pose_model_input == "pairs" else self.num_input_frames assert self.frame_ids[0] == 0, "frame_ids must start with 0" self.use_pose_net = not (self.use_stereo and self.frame_ids == [0]) self.encoder = ResnetEncoder(self.num_layers) if not self.only_depth_encoder: self.depth = DepthDecoder(self.encoder.num_ch_enc, self.scales) if self.use_pose_net and not self.only_depth_encoder: if self.pose_model_type == "separate_resnet": self.pose_encoder = ResnetEncoder_pose( self.num_layers, num_input_images=self.num_pose_frames) self.pose = PoseDecoder(self.pose_encoder.num_ch_enc, num_input_features=1, num_frames_to_predict_for=2) self.backproject_depth = {} self.project_3d = {} for scale in self.scales: h = self.height // (2**scale) w = self.width // (2**scale) self.backproject_depth[scale] = BackprojectDepth( self.batch_size, h, w) self.project_3d[scale] = Project3D(batch_size, h, w) def init_weights(self): """First init model's weight""" for m in self.sublayers(include_self=True): if isinstance(m, nn.Conv2D): kaiming_normal_(m.weight, a=math.sqrt(5)) if m.bias is not None: fan_in, _ = _calculate_fan_in_and_fan_out(m.weight) bound = 1 / math.sqrt(fan_in) uniform_ = paddle.nn.initializer.Uniform(-bound, bound) uniform_(m.bias) """Second, if provide pretrained ckpt, load it""" if self.pretrained: # load pretrained weights load_ckpt(self, self.pretrained) def forward(self, inputs, day_or_night='day'): if self.training: features, result = self.encoder(inputs["color_aug", 0, 0], 'day') features_night, result_night = self.encoder( inputs[("color_n_aug", 0, 0)], 'night') outputs = self.depth(features) outputs_night = self.depth(features_night) if self.use_pose_net and not self.only_depth_encoder: outputs.update(self.predict_poses(inputs, 'day')) outputs_night.update(self.predict_poses(inputs, 'night')) self.generate_images_pred(inputs, outputs, 'day') self.generate_images_pred(inputs, outputs_night, 'night') outputs['frame_ids'] = self.frame_ids outputs['scales'] = self.scales outputs['result'] = result outputs['result_night'] = result_night outputs_night['frame_ids'] = self.frame_ids outputs_night['scales'] = self.scales outputs['outputs_night'] = outputs_night else: if isinstance(inputs, dict): input_color = inputs[("color", 0, 0)] features = self.encoder(input_color, day_or_night[0]) outputs = self.depth(features) pred_disp, _ = disp_to_depth(outputs[("disp", 0)], self.min_depth, self.max_depth) pred_disp = pred_disp[:, 0].numpy() outputs['pred_disp'] = np.squeeze(pred_disp) outputs['gt'] = np.squeeze(inputs['depth_gt'].numpy()) else: input_color = inputs features = self.encoder(input_color, day_or_night) outputs = self.depth(features) pred_disp, _ = disp_to_depth(outputs[("disp", 0)], self.min_depth, self.max_depth) pred_disp = pred_disp[:, 0] outputs = paddle.squeeze(pred_disp) return outputs def predict_poses(self, inputs, is_night): """Predict poses between input frames for monocular sequences. """ outputs = {} if self.num_pose_frames == 2: if is_night: pose_feats = { f_i: inputs["color_n_aug", f_i, 0] for f_i in self.frame_ids } else: pose_feats = { f_i: inputs["color_aug", f_i, 0] for f_i in self.frame_ids } for f_i in self.frame_ids[1:]: if f_i != "s": if f_i < 0: pose_inputs = [pose_feats[f_i], pose_feats[0]] else: pose_inputs = [pose_feats[0], pose_feats[f_i]] if self.pose_model_type == "separate_resnet": pose_inputs = [ self.pose_encoder(paddle.concat(pose_inputs, axis=1)) ] axisangle, translation = self.pose(pose_inputs) outputs[("axisangle", 0, f_i)] = axisangle outputs[("translation", 0, f_i)] = translation # Invert the matrix if the frame id is negative outputs[("cam_T_cam", 0, f_i)] = transformation_from_parameters( axisangle[:, 0], translation[:, 0], invert=(f_i < 0)) return outputs def generate_images_pred(self, inputs, outputs, is_night): """Generate the warped (reprojected) color images for a minibatch. Generated images are saved into the `outputs` dictionary. """ _, _, height, width = inputs['color', 0, 0].shape for scale in self.scales: disp = outputs[("disp", scale)] if self.v1_multiscale: source_scale = scale else: disp = F.interpolate(disp, [height, width], mode="bilinear", align_corners=False) source_scale = 0 _, depth = disp_to_depth(disp, self.min_depth, self.max_depth) outputs[("depth", 0, scale)] = depth for i, frame_id in enumerate(self.frame_ids[1:]): T = outputs[("cam_T_cam", 0, frame_id)] cam_points = self.backproject_depth[source_scale]( depth, inputs[("inv_K", source_scale)]) pix_coords = self.project_3d[source_scale]( cam_points, inputs[("K", source_scale)], T) outputs[("sample", frame_id, scale)] = pix_coords if is_night: inputs[("color_n", frame_id, source_scale)].stop_gradient = False outputs[("color", frame_id, scale)] = paddle.nn.functional.grid_sample( inputs[("color_n", frame_id, source_scale)], outputs[("sample", frame_id, scale)], padding_mode="border", align_corners=False) else: inputs[("color", frame_id, source_scale)].stop_gradient = False outputs[("color", frame_id, scale)] = paddle.nn.functional.grid_sample( inputs[("color", frame_id, source_scale)], outputs[("sample", frame_id, scale)], padding_mode="border", align_corners=False) if not self.disable_automasking: if is_night: outputs[("color_identity", frame_id, scale)] = \ inputs[("color_n", frame_id, source_scale)] else: outputs[("color_identity", frame_id, scale)] = \ inputs[("color", frame_id, source_scale)]