为了实现Yolov5的推理,我们需要在source/layer/details中实现SiLU、Concat、UpSample和YoloDetect。
详细请看:https://blog.csdn.net/qq_32901731/article/details/129710271

SiLU

相当于x乘以sigmoid(x),即 SiLU(x) = x · sigmoid(x) = x / (1 + e^{-x})。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
// Applies the SiLU activation, SiLU(x) = x * sigmoid(x) = x / (1 + e^{-x}),
// element-wise to every tensor in the batch.
//
// inputs:  one feature-map tensor per batch element; must be non-empty.
// outputs: same length as inputs; null/empty slots are lazily allocated with
//          the corresponding input's shape.
// Returns kInferSuccess, or an error status on empty/mismatched inputs.
InferStatus SiLULayer::Forward(const std::vector<std::shared_ptr<Tensor<float>>> &inputs,
                               std::vector<std::shared_ptr<Tensor<float>>> &outputs) {
  if (inputs.empty()) {
    LOG(ERROR) << "The input feature map of silu layer is empty";
    return InferStatus::kInferFailedInputEmpty;
  }

  if (inputs.size() != outputs.size()) {
    LOG(ERROR) << "The input and output size of silu layer is not adapting";
    return InferStatus::kInferFailedInputOutSizeAdaptingError;
  }

  const uint32_t batch_size = inputs.size();
#pragma omp parallel for num_threads(batch_size)
  for (uint32_t i = 0; i < batch_size; ++i) {
    // Process each batch element independently.
    const std::shared_ptr<Tensor<float>> &input = inputs.at(i);
    // Fix: the original condition (input == nullptr || !input->empty()) was
    // satisfied by a null input, deferring the failure to a null dereference
    // below. The input must be non-null AND non-empty.
    CHECK(input != nullptr && !input->empty())
        << "The input feature map of silu layer is empty!";

    std::shared_ptr<Tensor<float>> output = outputs.at(i);
    if (output == nullptr || output->empty()) {
      // Lazily allocate the output with the same shape as the input.
      output = std::make_shared<Tensor<float>>(input->shapes());
      outputs.at(i) = output;
    }
    CHECK(output->shapes() == input->shapes()) << "The output size of silu layer is error";

    // Copy the input data into the output buffer, then apply SiLU in place.
    output->set_data(input->data());
    output->Transform([](const float value) {
      return value / (1.f + expf(-value));
    });
  }
  return InferStatus::kInferSuccess;
}

Concat

将多个张量在通道维(channel dim)进行拼接

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
// Concatenates groups of input tensors along the channel dimension.
// Inputs are grouped round-robin: output i is the concatenation of inputs
// i, i + output_size, i + 2 * output_size, ... (packet_size inputs each).
// Only dim 1 (or -3, counted from the back) is supported.
InferStatus CatLayer::Forward(
const std::vector<std::shared_ptr<Tensor<float>>>& inputs,
std::vector<std::shared_ptr<Tensor<float>>>& outputs) {
if (inputs.empty()) {
LOG(ERROR) << "The input feature map of cat layer is empty";
return InferStatus::kInferFailedInputEmpty;
}

// NOTE(review): this rejects inputs.size() == outputs.size() (i.e. a
// packet_size of 1, a 1:1 pass-through). Concat is expected to merge
// several inputs per output — confirm the 1:1 case is truly invalid here.
if (inputs.size() == outputs.size()) {
LOG(ERROR) << "The input and output size is not adapting";
return InferStatus::kInferFailedInputOutSizeAdaptingError;
}

// Only channel-dim concatenation is supported (dim 1, or -3 from the back).
if (dim_ != 1 && dim_ != -3) {
LOG(ERROR) << "The dimension of cat layer is error";
return InferStatus::kInferFailedDimensionParameterError;
}

// packet_size = how many inputs are concatenated into each output.
const uint32_t output_size = outputs.size();
CHECK(inputs.size() % output_size == 0);
const uint32_t packet_size = inputs.size() / output_size;

#pragma omp parallel for num_threads(outputs.size())
for (uint32_t i = 0; i < outputs.size(); ++i) {
std::shared_ptr<Tensor<float>> output = outputs.at(i);
uint32_t start_channel = 0;
// All inputs must share the spatial size of the first one.
uint32_t rows = inputs.front()->rows();
uint32_t cols = inputs.front()->cols();

// Walk this output's group: inputs i, i + output_size, i + 2*output_size...
for (uint32_t j = i; j < inputs.size(); j += output_size) {
const std::shared_ptr<Tensor<float>>& input = inputs.at(j);
CHECK(input != nullptr && !input->empty())
<< "The input feature map of cat layer is empty";

const uint32_t in_channels = input->channels();
CHECK(rows == input->rows() && cols == input->cols());

if (output == nullptr || output->empty()) {
// Lazily allocate: concatenated channel count, same spatial size.
// NOTE(review): assumes every input in the group has the same channel
// count as this one — the CHECK below enforces it per iteration.
output = std::make_shared<Tensor<float>>(in_channels * packet_size,
rows, cols);
outputs.at(i) = output;
}

// Check that the output channel count equals the per-input channel
// count times the number of inputs in the group.
CHECK(output->channels() == in_channels * packet_size &&
output->rows() == rows && output->cols() == cols);

// Copy this input's channels into the next free channel range of output.
for (uint32_t c = 0; c < in_channels; ++c) {
output->slice(start_channel + c) = input->slice(c);
}
start_channel += input->channels();
}
}
return InferStatus::kInferSuccess;
}

UpSample

将输入的大小(width和height)放大到指定的scale倍;放大方法这里采用nearest(最近邻)方法,
也就是通过复制最近点的值来进行上采样。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
// Nearest-neighbour up-sampling: every output pixel (y, x) copies the input
// pixel at (y / scale_h_, x / scale_w_). Outputs are lazily allocated with
// the scaled spatial size when a slot is null or empty.
InferStatus UpSampleLayer::Forward(const std::vector<std::shared_ptr<Tensor<float>>> &inputs,
                                   std::vector<std::shared_ptr<Tensor<float>>> &outputs) {
  if (inputs.empty()) {
    LOG(ERROR) << "The input feature map of upsample layer is empty";
    return InferStatus::kInferFailedInputEmpty;
  }

  if (inputs.size() != outputs.size()) {
    LOG(ERROR) << "The input and output size is not adapting";
    return InferStatus::kInferFailedInputOutSizeAdaptingError;
  }

  // Validate every input up front before touching any output.
  for (uint32_t i = 0; i < inputs.size(); ++i) {
    const sftensor &input_data = inputs.at(i);
    if (input_data == nullptr || input_data->empty()) {
      LOG(ERROR) << "The input feature map of upsample layer is empty";
      return InferStatus::kInferFailedInputEmpty;
    }
  }

  // Only nearest-neighbour interpolation is implemented.
  LOG_IF(FATAL, this->mode_ != UpSampleMode::kModeNearest)
      << "Unsupported upsample mode: " << int(mode_);

  const uint32_t batch_size = inputs.size();
#pragma omp parallel for num_threads(batch_size)
  for (uint32_t i = 0; i < batch_size; ++i) {
    // Handle one batch element per iteration.
    const arma::fcube &in_cube = inputs.at(i)->data();

    std::shared_ptr<Tensor<float>> output = outputs.at(i);
    if (output == nullptr || output->empty()) {
      // Lazily allocate the output at the scaled spatial size.
      output = std::make_shared<Tensor<float>>(in_cube.n_slices,
                                               uint32_t(in_cube.n_rows * scale_h_),
                                               uint32_t(in_cube.n_cols * scale_w_));
      outputs.at(i) = output;
    }
    auto &out_cube = output->data();
    // The pre-existing output must already have the scaled dimensions.
    CHECK(out_cube.n_rows == in_cube.n_rows * scale_h_) << "The height of the feature map is not adapting!";
    CHECK(out_cube.n_cols == in_cube.n_cols * scale_w_) << "The width of the feature map is not adapting!";
    CHECK(in_cube.n_slices == out_cube.n_slices) << "The channel of the feature map is not adapting!";

    for (uint32_t ch = 0; ch < in_cube.n_slices; ++ch) {
      // Up-sample one channel at a time.
      const arma::fmat &src = in_cube.slice(ch);
      arma::fmat &dst = out_cube.slice(ch);

      for (uint32_t x = 0; x < dst.n_cols; ++x) {
        // Map the output column back to its source column.
        const uint32_t src_x = uint32_t((float) x / this->scale_w_);
        CHECK(src_x < src.n_cols);

        for (uint32_t y = 0; y < dst.n_rows; ++y) {
          // Map the output row back to its source row and copy the value.
          const uint32_t src_y = uint32_t((float) y / this->scale_h_);
          CHECK(src_y < src.n_rows);
          dst.at(y, x) = src.at(src_y, src_x);
        }
      }
    }
  }
  return InferStatus::kInferSuccess;
}

YoloDetect

YoloDetect的Python定义如下,直接摘录自YoloV5项目的yolo.py文件。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
def forward(self, x):
z = [] # inference output
for i in range(self.nl):
x[i] = self.m[i](x[i]) # conv
bs, _, ny, nx = x[i].shape # x(bs,255,20,20) to x(bs,3,20,20,85)
x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

if not self.training: # inference
省略...
else: # Detect (boxes only)
xy, wh, conf = x[i].sigmoid().split((2, 2, self.nc + 1), 4)
xy = (xy * 2 + self.grid[i]) * self.stride[i] # xy
wh = (wh * 2) ** 2 * self.anchor_grid[i] # wh
y = torch.cat((xy, wh, conf), 4)
z.append(y.view(bs, self.na * nx * ny, self.no))

YoloDetectLayer的定义

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
// YoloV5 detection head layer: owns one convolution per detection stage plus
// the per-stage strides, grids and anchor grids used to decode raw
// predictions into boxes in Forward.
class YoloDetectLayer : public Layer {
public:
// stages: number of detection heads; num_classes: number of object classes;
// strides/anchor_grids/grids: per-stage decode parameters (one entry per
// stage); conv_layers: the per-stage output convolutions applied in Forward.
explicit YoloDetectLayer(int32_t stages,
int32_t num_classes,
const std::vector<float> &strides,
const std::vector<arma::fmat> &anchor_grids,
const std::vector<arma::fmat> &grids,
const std::vector<std::shared_ptr<ConvolutionLayer>> &conv_layers);

// Runs the detect head over a batch; see the definition for the decode math.
InferStatus Forward(const std::vector<std::shared_ptr<Tensor<float>>> &inputs,
std::vector<std::shared_ptr<Tensor<float>>> &outputs) override;

// Factory: builds a YoloDetectLayer from a parsed RuntimeOperator
// (used by the runtime graph when instantiating layers).
static ParseParameterAttrStatus GetInstance(const std::shared_ptr<RuntimeOperator> &op,
std::shared_ptr<Layer> &yolo_detect_layer);
private:
int32_t stages_ = 0;       // number of detection heads
int32_t num_classes_ = 0;  // object classes; per-box row width = num_classes_ + 5
std::vector<float> strides_;           // per-stage stride multiplier for xy decode
std::vector<arma::fmat> anchor_grids_; // per-stage anchor grid for wh decode
std::vector<arma::fmat> grids_;        // per-stage cell-offset grid for xy decode
std::vector<std::shared_ptr<ConvolutionLayer>> conv_layers_; // per-stage convs
};

YoloDetectLayer::Forward实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
// YoloV5 detect head. For each of the `stages_` detection heads:
//   1. run the head's convolution on its batch of feature maps,
//   2. reshape each result to (stages, classes_info, ny * nx) and apply
//      sigmoid element-wise,
//   3. decode boxes per stage: xy = (xy * 2 + grid) * stride,
//      wh = (wh * 2)^2 * anchor_grid (mirrors yolo.py's Detect.forward),
// then concatenate every head's rows into one (total_boxes, classes_info)
// matrix per batch element and write it to outputs.
//
// inputs:  stages_ * batch_size feature maps, grouped stage-major (the first
//          batch_size entries belong to stage 0, and so on).
// outputs: batch_size tensors; null/empty slots are lazily allocated.
InferStatus YoloDetectLayer::Forward(
    const std::vector<std::shared_ptr<Tensor<float>>> &inputs,
    std::vector<std::shared_ptr<Tensor<float>>> &outputs) {
  if (inputs.empty()) {
    LOG(ERROR) << "The input feature map of yolo detect layer is empty";
    return InferStatus::kInferFailedInputEmpty;
  }
  // Fix: guard against an empty output vector before dividing by its size.
  if (outputs.empty()) {
    LOG(ERROR) << "The output feature map of yolo detect layer is empty";
    return InferStatus::kInferFailedInputOutSizeAdaptingError;
  }

  const uint32_t stages = stages_;
  const uint32_t classes_info = num_classes_ + 5;  // xywh + objectness + classes
  const uint32_t input_size = inputs.size();
  const uint32_t batch_size = outputs.size();

  // There must be exactly stages_ inputs per batch element.
  if (input_size / batch_size != stages_ || input_size % batch_size != 0) {
    LOG(ERROR) << "The input and output number of yolo detect layer is wrong";
    return InferStatus::kInferFailedYoloStageNumberError;
  }

  CHECK(!this->conv_layers_.empty() && this->conv_layers_.size() == stages)
      << "The convolution layers in yolo detection layer is empty or do not "
         "have a correct number";

  // Regroup the flat input list into one batch per stage (stage-major order).
  std::vector<std::vector<std::shared_ptr<Tensor<float>>>> batches(stages);
  for (uint32_t i = 0; i < input_size; ++i) {
    const uint32_t index = i / batch_size;
    const auto &input_data = inputs.at(i);
    if (input_data == nullptr || input_data->empty()) {
      LOG(ERROR) << "The input feature map of yolo detect layer is empty";
      return InferStatus::kInferFailedInputEmpty;
    }
    // Fix: was `index <= batches.size()`, which accepted the out-of-range
    // value index == batches.size(); the index must be strictly less.
    CHECK(index < batches.size());
    batches.at(index).push_back(input_data);
  }

  // Run each stage's convolution over its batch.
  std::vector<std::vector<sftensor>> stage_outputs(stages);
#pragma omp parallel for num_threads(stages)
  for (uint32_t stage = 0; stage < stages; ++stage) {
    const std::vector<std::shared_ptr<Tensor<float>>> &stage_input =
        batches.at(stage);

    CHECK(stage_input.size() == batch_size)
        << "The number of stage do not equal to batch size";

    std::vector<std::shared_ptr<Tensor<float>>> stage_output(batch_size);
    const auto status =
        this->conv_layers_.at(stage)->Forward(stage_input, stage_output);

    CHECK(status == InferStatus::kInferSuccess)
        << "Infer failed, error code: " << int(status);
    CHECK(stage_output.size() == batch_size)
        << "The number of stage output do not equal to batch size";
    stage_outputs.at(stage) = stage_output;
  }

  uint32_t concat_rows = 0;
  std::vector<std::shared_ptr<Tensor<float>>> zs(stages);
  for (uint32_t stage = 0; stage < stages; ++stage) {
    const std::vector<sftensor> stage_output = stage_outputs.at(stage);
    // NOTE(review): nx_ is taken from rows() and ny_ from cols(), the reverse
    // of yolo.py's (ny, nx) = shape — confirm the convention is intended.
    const uint32_t nx_ = stage_output.front()->rows();
    const uint32_t ny_ = stage_output.front()->cols();
    for (uint32_t i = 0; i < stage_output.size(); ++i) {
      // All batch elements of one stage share the same spatial size.
      CHECK(stage_output.at(i)->rows() == nx_ &&
            stage_output.at(i)->cols() == ny_);
    }

    // One (stages * nx * ny, classes_info) slice per batch element.
    std::shared_ptr<Tensor<float>> x_stages_tensor;
    x_stages_tensor =
        TensorCreate(batch_size, stages * nx_ * ny_, uint32_t(classes_info));

#pragma omp parallel for num_threads(batch_size)
    for (uint32_t b = 0; b < batch_size; ++b) {
      // Decode one batch element of this stage.
      const std::shared_ptr<Tensor<float>> &input = stage_output.at(b);
      CHECK(input != nullptr && !input->empty());
      const uint32_t nx = input->rows();
      const uint32_t ny = input->cols();
      // Reshape the conv output to (stages, classes_info, ny * nx), matching
      // x[i].view(bs, na, no, ny, nx) in yolo.py.
      input->ReRawView({stages, uint32_t(classes_info), ny * nx});

      // x[i].sigmoid()
      input->Transform(
          [](const float value) { return 1.f / (1.f + expf(-value)); });

      // Transpose each anchor slice into a (ny*nx, classes_info) row block,
      // mirroring .split((2, 2, self.nc + 1), 4).
      arma::fmat &x_stages = x_stages_tensor->slice(b);
      for (uint32_t s = 0; s < stages; ++s) {
        x_stages.submat(ny * nx * s, 0, ny * nx * (s + 1) - 1,
                        classes_info - 1) = input->slice(s).t();
      }

      // xy = (xy * 2 + self.grid[i]) * self.stride[i]
      const arma::fmat &xy = x_stages.submat(0, 0, x_stages.n_rows - 1, 1);
      // wh = (wh * 2) ** 2 * self.anchor_grid[i]
      const arma::fmat &wh = x_stages.submat(0, 2, x_stages.n_rows - 1, 3);

      // y = torch.cat((xy, wh, conf), 4) — conf columns are left as-is.
      x_stages.submat(0, 0, x_stages.n_rows - 1, 1) =
          (xy * 2 + grids_[stage]) * strides_[stage];
      x_stages.submat(0, 2, x_stages.n_rows - 1, 3) =
          arma::pow((wh * 2), 2) % anchor_grids_[stage];
    }
    concat_rows += x_stages_tensor->rows();
    // All batch elements of this stage (detect head) end up in zs.at(stage).
    zs.at(stage) = x_stages_tensor;
  }

  // Concatenate the per-stage outputs row-wise into one cube f1
  // of shape (concat_rows, classes_info, batch_size).
  uint32_t current_rows = 0;
  arma::fcube f1(concat_rows, classes_info, batch_size);
  for (const auto &z : zs) {
    f1.subcube(current_rows, 0, 0, current_rows + z->rows() - 1,
               classes_info - 1, batch_size - 1) = z->data();
    current_rows += z->rows();
  }

  // Copy each batch slice of f1 into the corresponding output tensor.
  // (unsigned loop counter avoids the original signed/unsigned comparison)
  for (uint32_t i = 0; i < f1.n_slices; ++i) {
    std::shared_ptr<Tensor<float>> output = outputs.at(i);
    if (output == nullptr || output->empty()) {
      output = std::make_shared<Tensor<float>>(1, concat_rows, classes_info);
      outputs.at(i) = output;
    }
    CHECK(output->rows() == f1.slice(i).n_rows);
    CHECK(output->cols() == f1.slice(i).n_cols);
    output->slice(0) = std::move(f1.slice(i));
  }
  return InferStatus::kInferSuccess;
}