# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math

import numpy as np

import paddle
from paddle import ParamAttr

from ..registry import BACKBONES

def _get_interp1d_bin_mask(seg_xmin, seg_xmax, tscale, num_sample,
                           num_sample_perbin):
    """ generate sample mask for a boundary-matching pair """
    plen = float(seg_xmax - seg_xmin)
    plen_sample = plen / (num_sample * num_sample_perbin - 1.0)
    total_samples = [
        seg_xmin + plen_sample * ii
        for ii in range(num_sample * num_sample_perbin)
    ]
    p_mask = []
    for idx in range(num_sample):
        bin_samples = total_samples[idx * num_sample_perbin:(idx + 1) *
                                    num_sample_perbin]
        bin_vector = np.zeros([tscale])
        for sample in bin_samples:
            sample_upper = math.ceil(sample)
            sample_decimal, sample_down = math.modf(sample)
            if (tscale - 1) >= int(sample_down) >= 0:
                bin_vector[int(sample_down)] += 1 - sample_decimal
            if (tscale - 1) >= int(sample_upper) >= 0:
                bin_vector[int(sample_upper)] += sample_decimal
        bin_vector = 1.0 / num_sample_perbin * bin_vector
        p_mask.append(bin_vector)
    p_mask = np.stack(p_mask, axis=1)
    return p_mask
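
# Worked micro-example (added note, not in the original source): each sample
# point t spreads its weight linearly between the two neighbouring integer
# positions, i.e. floor(t) receives 1 - frac(t) and ceil(t) receives frac(t).
# For t = 2.25 this adds 0.75 to index 2 and 0.25 to index 3, and the per-bin
# total is then averaged by the 1.0 / num_sample_perbin factor above.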

def get_interp1d_mask(tscale, dscale, prop_boundary_ratio, num_sample,
                      num_sample_perbin):
    """ generate sample mask for each point in Boundary-Matching Map """
    mask_mat = []
    for start_index in range(tscale):
        mask_mat_vector = []
        for duration_index in range(dscale):
            if start_index + duration_index < tscale:
                p_xmin = start_index
                p_xmax = start_index + duration_index
                center_len = float(p_xmax - p_xmin) + 1
                sample_xmin = p_xmin - center_len * prop_boundary_ratio
                sample_xmax = p_xmax + center_len * prop_boundary_ratio
                p_mask = _get_interp1d_bin_mask(sample_xmin, sample_xmax,
                                                tscale, num_sample,
                                                num_sample_perbin)
            else:
                p_mask = np.zeros([tscale, num_sample])
            mask_mat_vector.append(p_mask)
        mask_mat_vector = np.stack(mask_mat_vector, axis=2)
        mask_mat.append(mask_mat_vector)
    mask_mat = np.stack(mask_mat, axis=3)
    mask_mat = mask_mat.astype(np.float32)

    sample_mask = np.reshape(mask_mat, [tscale, -1])
    return sample_mask
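
# Shape sketch (added note, not in the original source): each p_mask from
# _get_interp1d_bin_mask is [tscale, num_sample]; stacking over durations
# (axis=2) and start positions (axis=3) gives
# [tscale, num_sample, dscale, tscale], which is flattened to
# [tscale, num_sample * dscale * tscale]. With the default sizes quoted in
# the BMN docstring below:
#
#     mask = get_interp1d_mask(100, 100, 0.5, 32, 3)
#     assert mask.shape == (100, 32 * 100 * 100)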

def init_params(name, in_channels, kernel_size):
    fan_in = in_channels * kernel_size * 1
    k = 1. / math.sqrt(fan_in)
    param_attr = ParamAttr(name=name,
                           initializer=paddle.nn.initializer.Uniform(low=-k,
                                                                     high=k))
    return param_attr
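
# Worked example (added note, not in the original source): for a Conv1D with
# in_channels=256 and kernel_size=3, fan_in = 256 * 3 = 768 and
# k = 1 / sqrt(768) ≈ 0.036, so the parameters named above are drawn from the
# uniform distribution U(-0.036, 0.036).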

@BACKBONES.register()
class BMN(paddle.nn.Layer):
    """BMN model from
    `"BMN: Boundary-Matching Network for Temporal Action Proposal Generation" <https://arxiv.org/abs/1907.09702>`_

    Args:
        tscale (int): sequence length, default 100.
        dscale (int): max duration length, default 100.
        prop_boundary_ratio (float): ratio of expanded temporal region in proposal boundary, default 0.5.
        num_sample (int): number of samples between the starting boundary and ending boundary of each proposal, default 32.
        num_sample_perbin (int): number of selected points in each sample, default 3.
    """

    def __init__(
        self,
        tscale,
        dscale,
        prop_boundary_ratio,
        num_sample,
        num_sample_perbin,
        feat_dim=400,
    ):
        super(BMN, self).__init__()

        # init config
        self.feat_dim = feat_dim
        self.tscale = tscale
        self.dscale = dscale
        self.prop_boundary_ratio = prop_boundary_ratio
        self.num_sample = num_sample
        self.num_sample_perbin = num_sample_perbin

        self.hidden_dim_1d = 256
        self.hidden_dim_2d = 128
        self.hidden_dim_3d = 512

        # Base Module
        self.b_conv1 = paddle.nn.Conv1D(
            in_channels=self.feat_dim,
            out_channels=self.hidden_dim_1d,
            kernel_size=3,
            padding=1,
            groups=4,
            weight_attr=init_params('Base_1_w', self.feat_dim, 3),
            bias_attr=init_params('Base_1_b', self.feat_dim, 3))
        self.b_conv1_act = paddle.nn.ReLU()

        self.b_conv2 = paddle.nn.Conv1D(
            in_channels=self.hidden_dim_1d,
            out_channels=self.hidden_dim_1d,
            kernel_size=3,
            padding=1,
            groups=4,
            weight_attr=init_params('Base_2_w', self.hidden_dim_1d, 3),
            bias_attr=init_params('Base_2_b', self.hidden_dim_1d, 3))
        self.b_conv2_act = paddle.nn.ReLU()

        # Temporal Evaluation Module
        self.ts_conv1 = paddle.nn.Conv1D(
            in_channels=self.hidden_dim_1d,
            out_channels=self.hidden_dim_1d,
            kernel_size=3,
            padding=1,
            groups=4,
            weight_attr=init_params('TEM_s1_w', self.hidden_dim_1d, 3),
            bias_attr=init_params('TEM_s1_b', self.hidden_dim_1d, 3))
        self.ts_conv1_act = paddle.nn.ReLU()

        self.ts_conv2 = paddle.nn.Conv1D(
            in_channels=self.hidden_dim_1d,
            out_channels=1,
            kernel_size=1,
            padding=0,
            groups=1,
            weight_attr=init_params('TEM_s2_w', self.hidden_dim_1d, 1),
            bias_attr=init_params('TEM_s2_b', self.hidden_dim_1d, 1))
        self.ts_conv2_act = paddle.nn.Sigmoid()

        self.te_conv1 = paddle.nn.Conv1D(
            in_channels=self.hidden_dim_1d,
            out_channels=self.hidden_dim_1d,
            kernel_size=3,
            padding=1,
            groups=4,
            weight_attr=init_params('TEM_e1_w', self.hidden_dim_1d, 3),
            bias_attr=init_params('TEM_e1_b', self.hidden_dim_1d, 3))
        self.te_conv1_act = paddle.nn.ReLU()
        self.te_conv2 = paddle.nn.Conv1D(
            in_channels=self.hidden_dim_1d,
            out_channels=1,
            kernel_size=1,
            padding=0,
            groups=1,
            weight_attr=init_params('TEM_e2_w', self.hidden_dim_1d, 1),
            bias_attr=init_params('TEM_e2_b', self.hidden_dim_1d, 1))
        self.te_conv2_act = paddle.nn.Sigmoid()
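
        # Note (added comment, not in the original source): ts_conv2 and
        # te_conv2 above reduce the features to a single sigmoid channel each,
        # i.e. per-position start/end probabilities along the tscale axis.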

        # Proposal Evaluation Module
        self.p_conv1 = paddle.nn.Conv1D(
            in_channels=self.hidden_dim_1d,
            out_channels=self.hidden_dim_2d,
            kernel_size=3,
            padding=1,
            groups=1,
            weight_attr=init_params('PEM_1d_w', self.hidden_dim_1d, 3),
            bias_attr=init_params('PEM_1d_b', self.hidden_dim_1d, 3))
        self.p_conv1_act = paddle.nn.ReLU()

        # init to speed up
        sample_mask = get_interp1d_mask(self.tscale, self.dscale,
                                        self.prop_boundary_ratio,
                                        self.num_sample, self.num_sample_perbin)
        self.sample_mask = paddle.to_tensor(sample_mask)
        self.sample_mask.stop_gradient = True
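
        # Shape note (added comment, not in the original source): sample_mask
        # is [tscale, num_sample * dscale * tscale]; in forward() the PEM
        # features [N, hidden_dim_2d, tscale] are matmul'd with it and reshaped
        # to [N, hidden_dim_2d, num_sample, dscale, tscale] before p_conv3d1.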

        self.p_conv3d1 = paddle.nn.Conv3D(
            in_channels=128,
            out_channels=self.hidden_dim_3d,
            kernel_size=(self.num_sample, 1, 1),
            stride=(self.num_sample, 1, 1),
            padding=0,
            weight_attr=ParamAttr(name="PEM_3d1_w"),
            bias_attr=ParamAttr(name="PEM_3d1_b"))
        self.p_conv3d1_act = paddle.nn.ReLU()

        self.p_conv2d1 = paddle.nn.Conv2D(
            in_channels=512,
            out_channels=self.hidden_dim_2d,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=ParamAttr(name="PEM_2d1_w"),
            bias_attr=ParamAttr(name="PEM_2d1_b"))
        self.p_conv2d1_act = paddle.nn.ReLU()

        self.p_conv2d2 = paddle.nn.Conv2D(
            in_channels=128,
            out_channels=self.hidden_dim_2d,
            kernel_size=3,
            stride=1,
            padding=1,
            weight_attr=ParamAttr(name="PEM_2d2_w"),
            bias_attr=ParamAttr(name="PEM_2d2_b"))
        self.p_conv2d2_act = paddle.nn.ReLU()

        self.p_conv2d3 = paddle.nn.Conv2D(
            in_channels=128,
            out_channels=self.hidden_dim_2d,
            kernel_size=3,
            stride=1,
            padding=1,
            weight_attr=ParamAttr(name="PEM_2d3_w"),
            bias_attr=ParamAttr(name="PEM_2d3_b"))
        self.p_conv2d3_act = paddle.nn.ReLU()

        self.p_conv2d4 = paddle.nn.Conv2D(
            in_channels=128,
            out_channels=2,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=ParamAttr(name="PEM_2d4_w"),
            bias_attr=ParamAttr(name="PEM_2d4_b"))
        self.p_conv2d4_act = paddle.nn.Sigmoid()

    def init_weights(self):
        pass

    def forward(self, x):
        # Base Module
        x = self.b_conv1(x)
        x = self.b_conv1_act(x)
        x = self.b_conv2(x)
        x = self.b_conv2_act(x)

        # TEM
        xs = self.ts_conv1(x)
        xs = self.ts_conv1_act(xs)
        xs = self.ts_conv2(xs)
        xs = self.ts_conv2_act(xs)
        xs = paddle.squeeze(xs, axis=[1])
        xe = self.te_conv1(x)
        xe = self.te_conv1_act(xe)
        xe = self.te_conv2(xe)
        xe = self.te_conv2_act(xe)
        xe = paddle.squeeze(xe, axis=[1])

        # PEM
        xp = self.p_conv1(x)
        xp = self.p_conv1_act(xp)
        # BM layer
        xp = paddle.matmul(xp, self.sample_mask)
        xp = paddle.reshape(xp, shape=[0, 0, -1, self.dscale, self.tscale])

        xp = self.p_conv3d1(xp)
        xp = self.p_conv3d1_act(xp)
        xp = paddle.squeeze(xp, axis=[2])
        xp = self.p_conv2d1(xp)
        xp = self.p_conv2d1_act(xp)
        xp = self.p_conv2d2(xp)
        xp = self.p_conv2d2_act(xp)
        xp = self.p_conv2d3(xp)
        xp = self.p_conv2d3_act(xp)
        xp = self.p_conv2d4(xp)
        xp = self.p_conv2d4_act(xp)
        return xp, xs, xe
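
# Minimal usage sketch (added note, not in the original source; assumes the
# module is imported inside the package so that `from ..registry import
# BACKBONES` resolves, and uses the default sizes quoted in the docstring):
#
#     model = BMN(tscale=100, dscale=100, prop_boundary_ratio=0.5,
#                 num_sample=32, num_sample_perbin=3, feat_dim=400)
#     feats = paddle.randn([2, 400, 100])      # [N, feat_dim, tscale]
#     bm_map, start, end = model(feats)
#     # bm_map: [2, 2, 100, 100]; start, end: [2, 100]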