You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
268 lines
9.3 KiB
Python
268 lines
9.3 KiB
Python
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import argparse
import os
import os.path as osp
import sys

import paddle
from paddle.jit import to_static
from paddle.static import InputSpec

# Make the package root importable when this script is run from tools/.
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(__dir__, '../')))

from paddlevideo.modeling.builder import build_model
from paddlevideo.utils import get_config
|
def parse_args(argv=None):
    """Parse command-line arguments for the export-model script.

    Args:
        argv (list[str] | None): argument strings to parse. ``None`` (the
            default) falls back to ``sys.argv[1:]``, so existing callers of
            ``parse_args()`` are unaffected; passing a list enables testing.

    Returns:
        argparse.Namespace: parsed arguments (``config``, ``override``,
        ``pretrained_params``, ``output_path``, ``save_name``).
    """
    parser = argparse.ArgumentParser("PaddleVideo export model script")
    parser.add_argument('-c',
                        '--config',
                        type=str,
                        default='configs/example.yaml',
                        help='config file path')
    parser.add_argument('--override',
                        action='append',
                        default=[],
                        help='config options to be overridden')
    parser.add_argument("-p",
                        "--pretrained_params",
                        default='./best.pdparams',
                        type=str,
                        help='params path')
    parser.add_argument("-o",
                        "--output_path",
                        type=str,
                        default="./inference",
                        help='output path')
    parser.add_argument('--save_name',
                        type=str,
                        default=None,
                        help='specify the exported inference files'
                        '(pdiparams and pdmodel) name, only used in TIPC')

    return parser.parse_args(argv)
|
|
|
|
|
def trim_config(cfg):
    """Strip training-only attributes from a config before export.

    Reusing the training config brings useless attributes (such as
    ``backbone.pretrained``), and some build-phase attributes should be
    overridden (such as ``backbone.num_seg``). Trim them here.

    Args:
        cfg: parsed config object supporting both attribute and
            dict-style (``.get``) access.

    Returns:
        tuple: ``(cfg, model_name)`` — the trimmed config and the value of
        ``cfg.model_name``.
    """
    model_name = cfg.model_name
    if cfg.MODEL.get('backbone') and cfg.MODEL.backbone.get('pretrained'):
        cfg.MODEL.backbone.pretrained = ""  # not used when doing inference

    # For distillation configs, clear both Teacher and Student weights.
    if cfg.MODEL.get('models'):
        if cfg.MODEL.models[0]['Teacher']['backbone'].get('pretrained'):
            cfg.MODEL.models[0]['Teacher']['backbone']['pretrained'] = ""
        if cfg.MODEL.models[1]['Student']['backbone'].get('pretrained'):
            cfg.MODEL.models[1]['Student']['backbone']['pretrained'] = ""

    return cfg, model_name
|
|
|
|
|
|
def get_input_spec(cfg, model_name):
    """Build the nested InputSpec list used to trace the model for export.

    Args:
        cfg: the ``INFERENCE`` section of the config, carrying the shape
            attributes each model family needs (``num_seg``, ``target_size``,
            ``feat_dim``, ...).
        model_name (str): model name selecting the input-shape recipe.

    Returns:
        list: ``[[InputSpec, ...]]`` describing the model's forward inputs.

    Raises:
        ValueError: if no input spec is registered for ``model_name``
            (previously this fell through and crashed with
            ``UnboundLocalError`` at the return).
    """
    if model_name in ['ppTSM', 'TSM', 'MoViNet', 'ppTSMv2']:
        input_spec = [[
            InputSpec(
                shape=[None, cfg.num_seg, 3, cfg.target_size, cfg.target_size],
                dtype='float32'),
        ]]
    elif model_name in ['TokenShiftVisionTransformer']:
        input_spec = [[
            InputSpec(shape=[
                None, 3, cfg.num_seg * 3, cfg.target_size, cfg.target_size
            ],
                      dtype='float32'),
        ]]
    elif model_name in ['TSN', 'ppTSN']:
        input_spec = [[
            InputSpec(shape=[
                None, cfg.num_seg * 10, 3, cfg.target_size, cfg.target_size
            ],
                      dtype='float32'),
        ]]
    elif model_name in ['BMN']:
        input_spec = [[
            InputSpec(shape=[None, cfg.feat_dim, cfg.tscale],
                      dtype='float32',
                      name='feat_input'),
        ]]
    elif model_name in ['TimeSformer', 'ppTimeSformer']:
        input_spec = [[
            InputSpec(shape=[
                None, 3, cfg.num_seg * 3, cfg.target_size, cfg.target_size
            ],
                      dtype='float32'),
        ]]
    elif model_name in ['VideoSwin']:
        input_spec = [[
            InputSpec(shape=[
                None, 3, cfg.num_seg * cfg.seg_len * 1, cfg.target_size,
                cfg.target_size
            ],
                      dtype='float32'),
        ]]
    elif model_name in ['VideoSwin_TableTennis']:
        input_spec = [[
            InputSpec(shape=[
                None, 3, cfg.num_seg * cfg.seg_len * 3, cfg.target_size,
                cfg.target_size
            ],
                      dtype='float32'),
        ]]
    elif model_name in ['AttentionLSTM']:
        input_spec = [[
            InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[0]],
                      dtype='float32'),  # for rgb_data
            InputSpec(shape=[
                None,
            ], dtype='int64'),  # for rgb_len
            InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[0]],
                      dtype='float32'),  # for rgb_mask
            InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[1]],
                      dtype='float32'),  # for audio_data
            InputSpec(shape=[
                None,
            ], dtype='int64'),  # for audio_len
            InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[1]],
                      dtype='float32'),  # for audio_mask
        ]]
    elif model_name in ['SlowFast']:
        input_spec = [[
            InputSpec(shape=[
                None, 3, cfg.num_frames // cfg.alpha, cfg.target_size,
                cfg.target_size
            ],
                      dtype='float32',
                      name='slow_input'),
            InputSpec(shape=[
                None, 3, cfg.num_frames, cfg.target_size, cfg.target_size
            ],
                      dtype='float32',
                      name='fast_input'),
        ]]
    elif model_name in ['STGCN', 'AGCN', 'CTRGCN']:
        input_spec = [[
            InputSpec(shape=[
                None, cfg.num_channels, cfg.window_size, cfg.vertex_nums,
                cfg.person_nums
            ],
                      dtype='float32'),
        ]]
    # The model multiplies the first dim by the number of humans (N*M) at
    # runtime, so 1 is used as the batch dimension here.
    elif model_name in ['AGCN2s']:
        input_spec = [[
            InputSpec(shape=[
                1, cfg.num_channels, cfg.window_size, cfg.vertex_nums,
                cfg.person_nums
            ],
                      dtype='float32'),
        ]]
    elif model_name in ['TransNetV2']:
        input_spec = [[
            InputSpec(shape=[
                None,
                cfg.num_frames,
                cfg.height,
                cfg.width,
                cfg.num_channels,
            ],
                      dtype='float32'),
        ]]
    elif model_name in ['MSTCN', 'ASRF']:
        input_spec = [[
            InputSpec(shape=[None, cfg.num_channels, None], dtype='float32'),
        ]]
    elif model_name in ['ADDS']:
        input_spec = [[
            InputSpec(shape=[None, cfg.num_channels, cfg.height, cfg.width],
                      dtype='float32'),
        ]]
    elif model_name in ['AVA_SlowFast_FastRcnn']:
        input_spec = [[
            InputSpec(shape=[
                None, 3, cfg.num_frames // cfg.alpha, cfg.target_size,
                cfg.target_size
            ],
                      dtype='float32',
                      name='slow_input'),
            InputSpec(shape=[
                None, 3, cfg.num_frames, cfg.target_size, cfg.target_size
            ],
                      dtype='float32',
                      name='fast_input'),
            InputSpec(shape=[None, None, 4], dtype='float32', name='proposals'),
            InputSpec(shape=[None, 2], dtype='float32', name='img_shape')
        ]]
    elif model_name in ['PoseC3D']:
        input_spec = [[
            InputSpec(shape=[None, 1, 17, 48, 56, 56], dtype='float32'),
        ]]
    elif model_name in ['YOWO']:
        input_spec = [[
            InputSpec(shape=[
                1, 3, cfg.num_seg, cfg.target_size, cfg.target_size
            ],
                      dtype='float32'),
        ]]
    else:
        raise ValueError(
            f"Input spec for model ({model_name}) is not implemented.")
    return input_spec
|
|
|
|
|
|
def main():
    """Export a trained model to Paddle inference format (pdmodel/pdiparams).

    Loads the config and pretrained weights, re-parameterizes any rep-style
    layers, converts the model with ``to_static`` and saves it under
    ``args.output_path``.
    """
    args = parse_args()
    cfg, model_name = trim_config(
        get_config(args.config, overrides=args.override, show=False))

    print(f"Building model({model_name})...")
    model = build_model(cfg.MODEL)
    # Validate explicitly instead of `assert` (asserts are stripped under -O);
    # also fixes the misplaced paren in the original message.
    if not osp.isfile(args.pretrained_params):
        raise FileNotFoundError(
            f"pretrained params ({args.pretrained_params}) is not a file path.")

    # exist_ok avoids the isdir/makedirs race of the original check.
    os.makedirs(args.output_path, exist_ok=True)

    print(f"Loading params from ({args.pretrained_params})...")
    params = paddle.load(args.pretrained_params)
    model.set_dict(params)

    model.eval()

    # For re-parameterizable nets: fuse layers exposing rep() before export.
    # Default False so layers without an `is_repped` flag still get repped
    # instead of raising AttributeError.
    for layer in model.sublayers():
        if hasattr(layer, "rep") and not getattr(layer, "is_repped", False):
            layer.rep()

    input_spec = get_input_spec(cfg.INFERENCE, model_name)
    model = to_static(model, input_spec=input_spec)
    paddle.jit.save(
        model,
        osp.join(args.output_path,
                 model_name if args.save_name is None else args.save_name))
    print(
        f"model ({model_name}) has been already saved in ({args.output_path}).")
|
|
|
|
|
|
if __name__ == "__main__":
    main()
|