自制深度学习框架--实现Yolov5的推理
为了实现Yolov5的推理,我们需要在source/layer/details中实现SiLU、Concat、UpSample和YoloDetect。
详细请看:https://blog.csdn.net/qq_32901731/article/details/129710271
SiLU
相当于x与sigmoid(x)的乘积,即 SiLU(x) = x · sigmoid(x)。
InferStatus SiLULayer::Forward(const std::vector<std::shared_ptr<Tensor<float>>> &inputs, ...)
Concat
将多个张量在通道维(channel dim)进行拼接。
56InferStatus CatLayer::Forward(
const std::vector<std::shared_ptr<Tensor<float>>>& inputs,
std::vector<std::shared_ptr<Tensor<float>>>& outputs) {
if (inputs.empty()) {
LOG(ERROR) << "The input feature map of cat layer is empty";
return InferStatus::kInferFailedInputEmpty;
}
if (inputs.size() == outputs.size()) {
LOG(ERROR) << "The input and output size is not adapting";
return InferStatus::kInferFailedInputOutSizeAdaptingError;
}
if (dim_ != 1 && dim_ != -3) {
LOG(ERROR) << "The dimension of cat layer is error";
return InferStatus::kInferFailedDimensionParameterError;
}
const uint32_t output_size = outputs.size();
CHECK(inputs.size() % output_size == 0);
const uint32_t packet_size = inputs.size() / output_size;
for (uint32_t i = 0; i < outputs.size(); ++i) {
std::shared_ptr<Tensor<float>> output = outputs.at(i);
uint32_t start_channel = 0;
uint32_t rows = inputs.front()->rows();
uint32_t cols = inputs.front()->cols();
for (uint32_t j = i; j < inputs.size(); j += output_size) {
const std::shared_ptr<Tensor<float>>& input = inputs.at(j);
CHECK(input != nullptr && !input->empty())
<< "The input feature map of cat layer is empty";
const uint32_t in_channels = input->channels();
CHECK(rows == input->rows() && cols == input->cols());
if (output == nullptr || output->empty()) {
output = std::make_shared<Tensor<float>>(in_channels * packet_size,
rows, cols);
outputs.at(i) = output;
}
// 检查output的通道数量等于input数组的数量乘以input的维度
CHECK(output->channels() == in_channels * packet_size &&
output->rows() == rows && output->cols() == cols);
// 将逐个输入在output的通道维上拼接起来
for (uint32_t c = 0; c < in_channels; ++c) {
output->slice(start_channel + c) = input->slice(c);
}
start_channel += input->channels();
}
}
return InferStatus::kInferSuccess;
}
UpSample
输入的大小(width和height)放大到指定的scale倍而已,放大的方法这里采用了nearest方法,
也就是通过复制最近点的值来进行上采样。
InferStatus UpSampleLayer::Forward(const std::vector<std::shared_ptr<Tensor<float>>> &inputs, ...)
YoloDetect
YoloDetect的Python定义如下,直接摘录自YoloV5项目的yolo.py文件。
def forward(self, x):
z = [] # inference output
for i in range(self.nl):
x[i] = self.m[i](x[i]) # conv
bs, _, ny, nx = x[i].shape # x(bs,255,20,20) to x(bs,3,20,20,85)
x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
if not self.training: # inference
省略...
else: # Detect (boxes only)
xy, wh, conf = x[i].sigmoid().split((2, 2, self.nc + 1), 4)
xy = (xy * 2 + self.grid[i]) * self.stride[i] # xy
wh = (wh * 2) ** 2 * self.anchor_grid[i] # wh
y = torch.cat((xy, wh, conf), 4)
z.append(y.view(bs, self.na * nx * ny, self.no))
YoloDetectLayer的定义
// YOLOv5 detection head: runs one convolution per detection stage, decodes
// box xy/wh with the precomputed grids and strides, and concatenates the
// per-stage predictions into one output per batch element.
class YoloDetectLayer : public Layer {
 public:
  // stages: number of detection heads (feature-map scales).
  // num_classes: number of object classes (each row carries num_classes + 5 values).
  // strides: per-stage downsample stride used to scale decoded xy.
  // anchor_grids: per-stage anchor sizes, multiplied into decoded wh.
  // grids: per-stage cell-coordinate grids, added to decoded xy.
  // conv_layers: per-stage convolution applied before decoding.
  explicit YoloDetectLayer(int32_t stages,
                           int32_t num_classes,
                           const std::vector<float> &strides,
                           const std::vector<arma::fmat> &anchor_grids,
                           const std::vector<arma::fmat> &grids,
                           const std::vector<std::shared_ptr<ConvolutionLayer>> &conv_layers);

  // Decodes the raw feature maps in inputs (stages * batch tensors) into
  // one (rows, num_classes + 5) prediction tensor per batch element.
  InferStatus Forward(const std::vector<std::shared_ptr<Tensor<float>>> &inputs,
                      std::vector<std::shared_ptr<Tensor<float>>> &outputs) override;

  // Factory used by the runtime graph to build the layer from an operator node.
  static ParseParameterAttrStatus GetInstance(const std::shared_ptr<RuntimeOperator> &op,
                                              std::shared_ptr<Layer> &yolo_detect_layer);

 private:
  int32_t stages_ = 0;        // number of detection heads
  int32_t num_classes_ = 0;   // number of object classes
  std::vector<float> strides_;
  std::vector<arma::fmat> anchor_grids_;
  std::vector<arma::fmat> grids_;
  std::vector<std::shared_ptr<ConvolutionLayer>> conv_layers_;
};
YoloDetectLayer::Forward实现
129InferStatus YoloDetectLayer::Forward(
const std::vector<std::shared_ptr<Tensor<float>>> &inputs,
std::vector<std::shared_ptr<Tensor<float>>> &outputs) {
if (inputs.empty()) {
LOG(ERROR) << "The input feature map of yolo detect layer is empty";
return InferStatus::kInferFailedInputEmpty;
}
const uint32_t stages = stages_;
const uint32_t classes_info = num_classes_ + 5;
const uint32_t input_size = inputs.size();
const uint32_t batch_size = outputs.size();
if (input_size / batch_size != stages_ || input_size % batch_size != 0) {
LOG(ERROR) << "The input and output number of yolo detect layer is wrong";
return InferStatus::kInferFailedYoloStageNumberError;
}
CHECK(!this->conv_layers_.empty() && this->conv_layers_.size() == stages)
<< "The convolution layers in yolo detection layer is empty or do not "
"have a correct number";
std::vector<std::vector<std::shared_ptr<Tensor<float>>>> batches(stages);
for (uint32_t i = 0; i < input_size; ++i) {
const uint32_t index = i / batch_size;
const auto &input_data = inputs.at(i);
if (input_data == nullptr || input_data->empty()) {
LOG(ERROR) << "The input feature map of yolo detect layer is empty";
return InferStatus::kInferFailedInputEmpty;
}
CHECK(index <= batches.size());
batches.at(index).push_back(input_data);
}
// conv
std::vector<std::vector<sftensor>> stage_outputs(stages);
for (uint32_t stage = 0; stage < stages; ++stage) {
const std::vector<std::shared_ptr<Tensor<float>>> &stage_input =
batches.at(stage);
CHECK(stage_input.size() == batch_size)
<< "The number of stage do not equal to batch size";
std::vector<std::shared_ptr<Tensor<float>>> stage_output(batch_size);
const auto status =
this->conv_layers_.at(stage)->Forward(stage_input, stage_output);
CHECK(status == InferStatus::kInferSuccess)
<< "Infer failed, error code: " << int(status);
CHECK(stage_output.size() == batch_size)
<< "The number of stage output do not equal to batch size";
stage_outputs.at(stage) = stage_output;
}
uint32_t concat_rows = 0;
std::vector<std::shared_ptr<Tensor<float>>> zs(stages);
for (uint32_t stage = 0; stage < stages; ++stage) {
const std::vector<sftensor> stage_output = stage_outputs.at(stage);
const uint32_t nx_ = stage_output.front()->rows();
const uint32_t ny_ = stage_output.front()->cols();
for (uint32_t i = 0; i < stage_output.size(); ++i) {
CHECK(stage_output.at(i)->rows() == nx_ &&
stage_output.at(i)->cols() == ny_);
}
std::shared_ptr<Tensor<float>> x_stages_tensor;
x_stages_tensor =
TensorCreate(batch_size, stages * nx_ * ny_, uint32_t(classes_info));
for (uint32_t b = 0; b < batch_size; ++b) {
// 遍历每一批次
const std::shared_ptr<Tensor<float>> &input = stage_output.at(b);
CHECK(input != nullptr && !input->empty());
const uint32_t nx = input->rows();
const uint32_t ny = input->cols();
// 将input张量reshape到对应的形状(stages, classes_info, ny*nx)
input->ReRawView({stages, uint32_t(classes_info), ny * nx});
const uint32_t size = input->size();
// x[i].sigmoid()
input->Transform(
[](const float value) { return 1.f / (1.f + expf(-value)); });
// .split(2, 2, self.nc + 1), 4)
arma::fmat &x_stages = x_stages_tensor->slice(b);
for (uint32_t s = 0; s < stages; ++s) {
x_stages.submat(ny * nx * s, 0, ny * nx * (s + 1) - 1,
classes_info - 1) = input->slice(s).t();
}
// xy = (xy * 2 + self.grid[i]) * self.stride[i]
const arma::fmat &xy = x_stages.submat(0, 0, x_stages.n_rows - 1, 1);
// wh = (wh * 2) ** 2 * self.anchor_grid[i]
const arma::fmat &wh = x_stages.submat(0, 2, x_stages.n_rows - 1, 3);
// y = torch.cat((xy, wh, conf), 4)
x_stages.submat(0, 0, x_stages.n_rows - 1, 1) =
(xy * 2 + grids_[stage]) * strides_[stage];
x_stages.submat(0, 2, x_stages.n_rows - 1, 3) =
arma::pow((wh * 2), 2) % anchor_grids_[stage];
}
concat_rows += x_stages_tensor->rows();
// 一个stage(检测头)中所有批次的数据在处理完之后都会被放到x_stages和zs.at(stage)的位置
zs.at(stage) = x_stages_tensor;
}
// 将三个检测头的输出重新拼接起来,并存放到f1的位置
uint32_t current_rows = 0;
arma::fcube f1(concat_rows, classes_info, batch_size);
for (const auto &z : zs) {
f1.subcube(current_rows, 0, 0, current_rows + z->rows() - 1,
classes_info - 1, batch_size - 1) = z->data();
current_rows += z->rows();
}
for (int i = 0; i < f1.n_slices; ++i) {
std::shared_ptr<Tensor<float>> output = outputs.at(i);
if (output == nullptr || output->empty()) {
output = std::make_shared<Tensor<float>>(1, concat_rows, classes_info);
outputs.at(i) = output;
}
CHECK(output->rows() == f1.slice(i).n_rows);
CHECK(output->cols() == f1.slice(i).n_cols);
output->slice(0) = std::move(f1.slice(i));
}
return InferStatus::kInferSuccess;
}

