为了实现Yolov5的推理,我们需要在source/layer/details中实现SiLU、Concat、UpSample和YoloDetect。
详细请看:https://blog.csdn.net/qq_32901731/article/details/129710271

SiLU

相当于x乘以sigmoid(x),即 SiLU(x) = x · sigmoid(x) = x / (1 + e^{-x})。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
// Applies the SiLU activation, SiLU(x) = x * sigmoid(x) = x / (1 + e^{-x}),
// element-wise to every tensor in the batch.
//
// inputs:  one feature-map tensor per batch element; must be non-empty.
// outputs: same length as inputs; null/empty slots are lazily allocated with
//          the corresponding input's shape.
// Returns kInferSuccess, or an error status on empty/mismatched inputs.
InferStatus SiLULayer::Forward(const std::vector<std::shared_ptr<Tensor<float>>> &inputs,
                               std::vector<std::shared_ptr<Tensor<float>>> &outputs) {
  if (inputs.empty()) {
    LOG(ERROR) << "The input feature map of silu layer is empty";
    return InferStatus::kInferFailedInputEmpty;
  }

  if (inputs.size() != outputs.size()) {
    LOG(ERROR) << "The input and output size of silu layer is not adapting";
    return InferStatus::kInferFailedInputOutSizeAdaptingError;
  }

  const uint32_t batch_size = inputs.size();
#pragma omp parallel for num_threads(batch_size)
  for (uint32_t i = 0; i < batch_size; ++i) {
    // Process each batch element independently.
    const std::shared_ptr<Tensor<float>> &input = inputs.at(i);
    // Fix: the original condition (input == nullptr || !input->empty()) was
    // satisfied by a null input, deferring the failure to a null dereference
    // below. The input must be non-null AND non-empty.
    CHECK(input != nullptr && !input->empty())
        << "The input feature map of silu layer is empty!";

    std::shared_ptr<Tensor<float>> output = outputs.at(i);
    if (output == nullptr || output->empty()) {
      // Lazily allocate the output with the same shape as the input.
      output = std::make_shared<Tensor<float>>(input->shapes());
      outputs.at(i) = output;
    }
    CHECK(output->shapes() == input->shapes()) << "The output size of silu layer is error";

    // Copy the input data into the output buffer, then apply SiLU in place.
    output->set_data(input->data());
    output->Transform([](const float value) {
      return value / (1.f + expf(-value));
    });
  }
  return InferStatus::kInferSuccess;
}

Concat

将多个张量在通道维(channel dim)进行拼接

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
// Concatenates groups of input tensors along the channel dimension.
// Inputs are grouped round-robin: output i is the concatenation of inputs
// i, i + output_size, i + 2 * output_size, ... (packet_size inputs each).
// Only dim 1 (or -3, counted from the back) is supported.
InferStatus CatLayer::Forward(
const std::vector<std::shared_ptr<Tensor<float>>>& inputs,
std::vector<std::shared_ptr<Tensor<float>>>& outputs) {
if (inputs.empty()) {
LOG(ERROR) << "The input feature map of cat layer is empty";
return InferStatus::kInferFailedInputEmpty;
}

// NOTE(review): this rejects inputs.size() == outputs.size() (i.e. a
// packet_size of 1, a 1:1 pass-through). Concat is expected to merge
// several inputs per output — confirm the 1:1 case is truly invalid here.
if (inputs.size() == outputs.size()) {
LOG(ERROR) << "The input and output size is not adapting";
return InferStatus::kInferFailedInputOutSizeAdaptingError;
}

// Only channel-dim concatenation is supported (dim 1, or -3 from the back).
if (dim_ != 1 && dim_ != -3) {
LOG(ERROR) << "The dimension of cat layer is error";
return InferStatus::kInferFailedDimensionParameterError;
}

// packet_size = how many inputs are concatenated into each output.
const uint32_t output_size = outputs.size();
CHECK(inputs.size() % output_size == 0);
const uint32_t packet_size = inputs.size() / output_size;

#pragma omp parallel for num_threads(outputs.size())
for (uint32_t i = 0; i < outputs.size(); ++i) {
std::shared_ptr<Tensor<float>> output = outputs.at(i);
uint32_t start_channel = 0;
// All inputs must share the spatial size of the first one.
uint32_t rows = inputs.front()->rows();
uint32_t cols = inputs.front()->cols();

// Walk this output's group: inputs i, i + output_size, i + 2*output_size...
for (uint32_t j = i; j < inputs.size(); j += output_size) {
const std::shared_ptr<Tensor<float>>& input = inputs.at(j);
CHECK(input != nullptr && !input->empty())
<< "The input feature map of cat layer is empty";

const uint32_t in_channels = input->channels();
CHECK(rows == input->rows() && cols == input->cols());

if (output == nullptr || output->empty()) {
// Lazily allocate: concatenated channel count, same spatial size.
// NOTE(review): assumes every input in the group has the same channel
// count as this one — the CHECK below enforces it per iteration.
output = std::make_shared<Tensor<float>>(in_channels * packet_size,
rows, cols);
outputs.at(i) = output;
}

// Check that the output channel count equals the per-input channel
// count times the number of inputs in the group.
CHECK(output->channels() == in_channels * packet_size &&
output->rows() == rows && output->cols() == cols);

// Copy this input's channels into the next free channel range of output.
for (uint32_t c = 0; c < in_channels; ++c) {
output->slice(start_channel + c) = input->slice(c);
}
start_channel += input->channels();
}
}
return InferStatus::kInferSuccess;
}

UpSample

将输入的大小(width和height)放大到指定的scale倍;放大方法这里采用nearest(最近邻)方法,
也就是通过复制最近点的值来进行上采样。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
// Nearest-neighbour up-sampling: every output pixel (y, x) copies the input
// pixel at (y / scale_h_, x / scale_w_). Outputs are lazily allocated with
// the scaled spatial size when a slot is null or empty.
InferStatus UpSampleLayer::Forward(const std::vector<std::shared_ptr<Tensor<float>>> &inputs,
                                   std::vector<std::shared_ptr<Tensor<float>>> &outputs) {
  if (inputs.empty()) {
    LOG(ERROR) << "The input feature map of upsample layer is empty";
    return InferStatus::kInferFailedInputEmpty;
  }

  if (inputs.size() != outputs.size()) {
    LOG(ERROR) << "The input and output size is not adapting";
    return InferStatus::kInferFailedInputOutSizeAdaptingError;
  }

  // Validate every input up front before touching any output.
  for (uint32_t i = 0; i < inputs.size(); ++i) {
    const sftensor &input_data = inputs.at(i);
    if (input_data == nullptr || input_data->empty()) {
      LOG(ERROR) << "The input feature map of upsample layer is empty";
      return InferStatus::kInferFailedInputEmpty;
    }
  }

  // Only nearest-neighbour interpolation is implemented.
  LOG_IF(FATAL, this->mode_ != UpSampleMode::kModeNearest)
      << "Unsupported upsample mode: " << int(mode_);

  const uint32_t batch_size = inputs.size();
#pragma omp parallel for num_threads(batch_size)
  for (uint32_t i = 0; i < batch_size; ++i) {
    // Handle one batch element per iteration.
    const arma::fcube &in_cube = inputs.at(i)->data();

    std::shared_ptr<Tensor<float>> output = outputs.at(i);
    if (output == nullptr || output->empty()) {
      // Lazily allocate the output at the scaled spatial size.
      output = std::make_shared<Tensor<float>>(in_cube.n_slices,
                                               uint32_t(in_cube.n_rows * scale_h_),
                                               uint32_t(in_cube.n_cols * scale_w_));
      outputs.at(i) = output;
    }
    auto &out_cube = output->data();
    // The pre-existing output must already have the scaled dimensions.
    CHECK(out_cube.n_rows == in_cube.n_rows * scale_h_) << "The height of the feature map is not adapting!";
    CHECK(out_cube.n_cols == in_cube.n_cols * scale_w_) << "The width of the feature map is not adapting!";
    CHECK(in_cube.n_slices == out_cube.n_slices) << "The channel of the feature map is not adapting!";

    for (uint32_t ch = 0; ch < in_cube.n_slices; ++ch) {
      // Up-sample one channel at a time.
      const arma::fmat &src = in_cube.slice(ch);
      arma::fmat &dst = out_cube.slice(ch);

      for (uint32_t x = 0; x < dst.n_cols; ++x) {
        // Map the output column back to its source column.
        const uint32_t src_x = uint32_t((float) x / this->scale_w_);
        CHECK(src_x < src.n_cols);

        for (uint32_t y = 0; y < dst.n_rows; ++y) {
          // Map the output row back to its source row and copy the value.
          const uint32_t src_y = uint32_t((float) y / this->scale_h_);
          CHECK(src_y < src.n_rows);
          dst.at(y, x) = src.at(src_y, src_x);
        }
      }
    }
  }
  return InferStatus::kInferSuccess;
}

YoloDetect

YoloDetect的Python定义如下,直接摘录自YoloV5项目的yolo.py文件。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
def forward(self, x):
z = [] # inference output
for i in range(self.nl):
x[i] = self.m[i](x[i]) # conv
bs, _, ny, nx = x[i].shape # x(bs,255,20,20) to x(bs,3,20,20,85)
x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

if not self.training: # inference
省略...
else: # Detect (boxes only)
xy, wh, conf = x[i].sigmoid().split((2, 2, self.nc + 1), 4)
xy = (xy * 2 + self.grid[i]) * self.stride[i] # xy
wh = (wh * 2) ** 2 * self.anchor_grid[i] # wh
y = torch.cat((xy, wh, conf), 4)
z.append(y.view(bs, self.na * nx * ny, self.no))

YoloDetectLayer的定义

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
// YoloV5 detection head layer: owns one convolution per detection stage plus
// the per-stage strides, grids and anchor grids used to decode raw
// predictions into boxes in Forward.
class YoloDetectLayer : public Layer {
public:
// stages: number of detection heads; num_classes: number of object classes;
// strides/anchor_grids/grids: per-stage decode parameters (one entry per
// stage); conv_layers: the per-stage output convolutions applied in Forward.
explicit YoloDetectLayer(int32_t stages,
int32_t num_classes,
const std::vector<float> &strides,
const std::vector<arma::fmat> &anchor_grids,
const std::vector<arma::fmat> &grids,
const std::vector<std::shared_ptr<ConvolutionLayer>> &conv_layers);

// Runs the detect head over a batch; see the definition for the decode math.
InferStatus Forward(const std::vector<std::shared_ptr<Tensor<float>>> &inputs,
std::vector<std::shared_ptr<Tensor<float>>> &outputs) override;

// Factory: builds a YoloDetectLayer from a parsed RuntimeOperator
// (used by the runtime graph when instantiating layers).
static ParseParameterAttrStatus GetInstance(const std::shared_ptr<RuntimeOperator> &op,
std::shared_ptr<Layer> &yolo_detect_layer);
private:
int32_t stages_ = 0;       // number of detection heads
int32_t num_classes_ = 0;  // object classes; per-box row width = num_classes_ + 5
std::vector<float> strides_;           // per-stage stride multiplier for xy decode
std::vector<arma::fmat> anchor_grids_; // per-stage anchor grid for wh decode
std::vector<arma::fmat> grids_;        // per-stage cell-offset grid for xy decode
std::vector<std::shared_ptr<ConvolutionLayer>> conv_layers_; // per-stage convs
};

YoloDetectLayer::Forward实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
// YoloV5 detect head. For each of the `stages_` detection heads:
//   1. run the head's convolution on its batch of feature maps,
//   2. reshape each result to (stages, classes_info, ny * nx) and apply
//      sigmoid element-wise,
//   3. decode boxes per stage: xy = (xy * 2 + grid) * stride,
//      wh = (wh * 2)^2 * anchor_grid (mirrors yolo.py's Detect.forward),
// then concatenate every head's rows into one (total_boxes, classes_info)
// matrix per batch element and write it to outputs.
//
// inputs:  stages_ * batch_size feature maps, grouped stage-major (the first
//          batch_size entries belong to stage 0, and so on).
// outputs: batch_size tensors; null/empty slots are lazily allocated.
InferStatus YoloDetectLayer::Forward(
    const std::vector<std::shared_ptr<Tensor<float>>> &inputs,
    std::vector<std::shared_ptr<Tensor<float>>> &outputs) {
  if (inputs.empty()) {
    LOG(ERROR) << "The input feature map of yolo detect layer is empty";
    return InferStatus::kInferFailedInputEmpty;
  }
  // Fix: guard against an empty output vector before dividing by its size.
  if (outputs.empty()) {
    LOG(ERROR) << "The output feature map of yolo detect layer is empty";
    return InferStatus::kInferFailedInputOutSizeAdaptingError;
  }

  const uint32_t stages = stages_;
  const uint32_t classes_info = num_classes_ + 5;  // xywh + objectness + classes
  const uint32_t input_size = inputs.size();
  const uint32_t batch_size = outputs.size();

  // There must be exactly stages_ inputs per batch element.
  if (input_size / batch_size != stages_ || input_size % batch_size != 0) {
    LOG(ERROR) << "The input and output number of yolo detect layer is wrong";
    return InferStatus::kInferFailedYoloStageNumberError;
  }

  CHECK(!this->conv_layers_.empty() && this->conv_layers_.size() == stages)
      << "The convolution layers in yolo detection layer is empty or do not "
         "have a correct number";

  // Regroup the flat input list into one batch per stage (stage-major order).
  std::vector<std::vector<std::shared_ptr<Tensor<float>>>> batches(stages);
  for (uint32_t i = 0; i < input_size; ++i) {
    const uint32_t index = i / batch_size;
    const auto &input_data = inputs.at(i);
    if (input_data == nullptr || input_data->empty()) {
      LOG(ERROR) << "The input feature map of yolo detect layer is empty";
      return InferStatus::kInferFailedInputEmpty;
    }
    // Fix: was `index <= batches.size()`, which accepted the out-of-range
    // value index == batches.size(); the index must be strictly less.
    CHECK(index < batches.size());
    batches.at(index).push_back(input_data);
  }

  // Run each stage's convolution over its batch.
  std::vector<std::vector<sftensor>> stage_outputs(stages);
#pragma omp parallel for num_threads(stages)
  for (uint32_t stage = 0; stage < stages; ++stage) {
    const std::vector<std::shared_ptr<Tensor<float>>> &stage_input =
        batches.at(stage);

    CHECK(stage_input.size() == batch_size)
        << "The number of stage do not equal to batch size";

    std::vector<std::shared_ptr<Tensor<float>>> stage_output(batch_size);
    const auto status =
        this->conv_layers_.at(stage)->Forward(stage_input, stage_output);

    CHECK(status == InferStatus::kInferSuccess)
        << "Infer failed, error code: " << int(status);
    CHECK(stage_output.size() == batch_size)
        << "The number of stage output do not equal to batch size";
    stage_outputs.at(stage) = stage_output;
  }

  uint32_t concat_rows = 0;
  std::vector<std::shared_ptr<Tensor<float>>> zs(stages);
  for (uint32_t stage = 0; stage < stages; ++stage) {
    const std::vector<sftensor> stage_output = stage_outputs.at(stage);
    // NOTE(review): nx_ is taken from rows() and ny_ from cols(), the reverse
    // of yolo.py's (ny, nx) = shape — confirm the convention is intended.
    const uint32_t nx_ = stage_output.front()->rows();
    const uint32_t ny_ = stage_output.front()->cols();
    for (uint32_t i = 0; i < stage_output.size(); ++i) {
      // All batch elements of one stage share the same spatial size.
      CHECK(stage_output.at(i)->rows() == nx_ &&
            stage_output.at(i)->cols() == ny_);
    }

    // One (stages * nx * ny, classes_info) slice per batch element.
    std::shared_ptr<Tensor<float>> x_stages_tensor;
    x_stages_tensor =
        TensorCreate(batch_size, stages * nx_ * ny_, uint32_t(classes_info));

#pragma omp parallel for num_threads(batch_size)
    for (uint32_t b = 0; b < batch_size; ++b) {
      // Decode one batch element of this stage.
      const std::shared_ptr<Tensor<float>> &input = stage_output.at(b);
      CHECK(input != nullptr && !input->empty());
      const uint32_t nx = input->rows();
      const uint32_t ny = input->cols();
      // Reshape the conv output to (stages, classes_info, ny * nx), matching
      // x[i].view(bs, na, no, ny, nx) in yolo.py.
      input->ReRawView({stages, uint32_t(classes_info), ny * nx});

      // x[i].sigmoid()
      input->Transform(
          [](const float value) { return 1.f / (1.f + expf(-value)); });

      // Transpose each anchor slice into a (ny*nx, classes_info) row block,
      // mirroring .split((2, 2, self.nc + 1), 4).
      arma::fmat &x_stages = x_stages_tensor->slice(b);
      for (uint32_t s = 0; s < stages; ++s) {
        x_stages.submat(ny * nx * s, 0, ny * nx * (s + 1) - 1,
                        classes_info - 1) = input->slice(s).t();
      }

      // xy = (xy * 2 + self.grid[i]) * self.stride[i]
      const arma::fmat &xy = x_stages.submat(0, 0, x_stages.n_rows - 1, 1);
      // wh = (wh * 2) ** 2 * self.anchor_grid[i]
      const arma::fmat &wh = x_stages.submat(0, 2, x_stages.n_rows - 1, 3);

      // y = torch.cat((xy, wh, conf), 4) — conf columns are left as-is.
      x_stages.submat(0, 0, x_stages.n_rows - 1, 1) =
          (xy * 2 + grids_[stage]) * strides_[stage];
      x_stages.submat(0, 2, x_stages.n_rows - 1, 3) =
          arma::pow((wh * 2), 2) % anchor_grids_[stage];
    }
    concat_rows += x_stages_tensor->rows();
    // All batch elements of this stage (detect head) end up in zs.at(stage).
    zs.at(stage) = x_stages_tensor;
  }

  // Concatenate the per-stage outputs row-wise into one cube f1
  // of shape (concat_rows, classes_info, batch_size).
  uint32_t current_rows = 0;
  arma::fcube f1(concat_rows, classes_info, batch_size);
  for (const auto &z : zs) {
    f1.subcube(current_rows, 0, 0, current_rows + z->rows() - 1,
               classes_info - 1, batch_size - 1) = z->data();
    current_rows += z->rows();
  }

  // Copy each batch slice of f1 into the corresponding output tensor.
  // (unsigned loop counter avoids the original signed/unsigned comparison)
  for (uint32_t i = 0; i < f1.n_slices; ++i) {
    std::shared_ptr<Tensor<float>> output = outputs.at(i);
    if (output == nullptr || output->empty()) {
      output = std::make_shared<Tensor<float>>(1, concat_rows, classes_info);
      outputs.at(i) = output;
    }
    CHECK(output->rows() == f1.slice(i).n_rows);
    CHECK(output->cols() == f1.slice(i).n_cols);
    output->slice(0) = std::move(f1.slice(i));
  }
  return InferStatus::kInferSuccess;
}