再探Tensor类

在之前,我们实现的张量初始化是这样的。

1
2
3
// Legacy constructor: backs the tensor with a 3-D arma::fcube.
// Note arma::fcube's argument order is (n_rows, n_cols, n_slices),
// so channels are mapped onto the slice dimension.
Tensor<float>::Tensor(uint32_t channels, uint32_t rows, uint32_t cols) {
data_ = arma::fcube(rows, cols, channels);
}

这个Tensor类其实并不能满足我们的使用需要,因为我们有些时候数据并不是三维的,
原来的Tensor不能在逻辑上区分当前的张量是三维的、二维的还是一维的,因为实际的数据存储类arma::fcube总是一个三维数据。
而且在之前我们也没有实现reshape。

所以,现在让我们一起来完善这个Tensor类吧。

1
2
3
4
5
6
7
8
9
10
11
// Construct a tensor backed by a 3-D arma::fcube while remembering the
// logical rank (1-D / 2-D / 3-D) separately in raw_shapes_.
// arma::fcube is laid out as (n_rows, n_cols, n_slices): channels -> slices.
Tensor<float>::Tensor(uint32_t channels, uint32_t rows, uint32_t cols) {
  data_ = arma::fcube(rows, cols, channels);
  // Record the logical shape: drop leading size-1 dimensions so that
  // Reshape/View and Flatten layers can tell the real dimensionality apart
  // even though the physical storage is always three-dimensional.
  if (channels != 1) {
    this->raw_shapes_ = std::vector<uint32_t>{channels, rows, cols};  // 3-D
  } else if (rows != 1) {
    this->raw_shapes_ = std::vector<uint32_t>{rows, cols};  // 2-D
  } else {
    this->raw_shapes_ = std::vector<uint32_t>{cols};  // 1-D
  }
}

在这里,我们调用arma::fcube来初始化data_;
同时raw_shapes_记录的是另外一个方面的形状信息,主要用于后续的View(Reshape)和Flatten层中。
尽管实际的数据存储类arma::fcube总是一个三维数据,但是逻辑上用raw_shapes
来记录当前的张量是三维的、二维的还是一维的。

列优先的Reshape

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
// Column-major reshape: delegates to arma::Cube::reshape, which refills the
// cube in armadillo's native column-major element order.
void Tensor<float>::ReRawshape(const std::vector<uint32_t>& shapes) {
  CHECK(!this->data_.empty());
  CHECK(!shapes.empty());
  CHECK(shapes.size() <= 3);
  // A reshape must preserve the total number of elements.
  uint32_t new_size = 1;
  for (const uint32_t dim : shapes) {
    new_size *= dim;
  }
  CHECK(new_size == this->size());

  // cube.reshape(n_rows, n_cols, n_slices)
  switch (shapes.size()) {
    case 3:
      // shapes = {channels, rows, cols}
      this->data_.reshape(shapes.at(1), shapes.at(2), shapes.at(0));
      break;
    case 2:
      // shapes = {rows, cols}
      this->data_.reshape(shapes.at(0), shapes.at(1), 1);
      break;
    default:
      // shapes = {cols}
      this->data_.reshape(shapes.at(0), 1, 1);
      break;
  }
  // The requested shape is exactly the new logical shape.
  this->raw_shapes_ = shapes;
}

在这里调用了arma::Cube::reshape。
由于arma::Cube是一个列优先(column-major)的容器,所以这种Reshape的元素重排方式是列优先的。

行优先的Reshape

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
// Row-major reshape: re-orders elements as if the tensor were stored
// row-major, unlike ReRawshape which keeps armadillo's column-major order.
void Tensor<float>::ReView(const std::vector<uint32_t>& shapes) {
  CHECK(!this->data_.empty());
  // shapes = {channels, rows, cols}
  CHECK(shapes.size() == 3) << "ReView expects a 3-d target shape";
  const uint32_t target_channels = shapes.at(0);
  const uint32_t target_rows = shapes.at(1);
  const uint32_t target_cols = shapes.at(2);
  // arma::Cube::at() is an *unchecked* accessor, so a mismatched element
  // count would silently write out of bounds; reject it up front.
  CHECK(target_channels * target_rows * target_cols == this->size())
      << "ReView target shape has a different number of elements";
  // new_data holds the reshaped result
  arma::fcube new_data(target_rows, target_cols, target_channels);

  const uint32_t plane_size = target_rows * target_cols;
  for (uint32_t c = 0; c < this->data_.n_slices; ++c) {
    // Visit the source data channel by channel.
    const arma::fmat& channel = this->data_.slice(c);
    for (uint32_t c_ = 0; c_ < this->data_.n_cols; ++c_) {
      // Walk each column through its raw pointer (column-major storage).
      const float* colptr = channel.colptr(c_);
      for (uint32_t r = 0; r < this->data_.n_rows; ++r) {
        // Row-major linear index of the current source element:
        // index = c * rows * cols + r * cols + c_
        const uint32_t pos_index =
            c * data_.n_rows * data_.n_cols + r * data_.n_cols + c_;
        // Map the linear index back to (channel, row, col) in the new shape.
        const uint32_t ch = pos_index / plane_size;
        const uint32_t row = (pos_index - ch * plane_size) / target_cols;
        const uint32_t col = (pos_index - ch * plane_size - row * target_cols);
        // Copy the element into its new position.
        new_data.at(row, col, ch) = *(colptr + r);
      }
    }
  }
  this->data_ = std::move(new_data);
}

构建计算图的图关系

在之前的计算图初始化中RuntimeGraph::Init(),我们并没有构建计算图的图关系。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
// Build the edges of the computation graph.
for (const auto &current_op : this->operators_) {
// Names of the operators that consume current_op's outputs.
const std::vector<std::string> &output_names = current_op->output_names;
// Scan all operators looking for successors of current_op.
for (const auto &next_op : this->operators_) {
// Skip the operator itself.
if (next_op == current_op) {
continue;
}
// next_op is a successor if its name appears in current_op's output list.
if (std::find(output_names.begin(), output_names.end(), next_op->name) !=
output_names.end()) {
// Record next_op as a downstream node of current_op.
current_op->output_operators.insert({next_op->name, next_op});
}
}
}

计算图初始化完成后,接下来我们需要做的事情是找到op list(this->operators_)中的输入和输出节点。
众所周知,一个图一定有输入和输出。打个比方,
图的执行就像走迷宫,我们在走迷宫之前需要先确定迷宫的入口和出口位置。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
// Build the runtime graph: ensure it is initialized, locate the graph's
// input/output operators, allocate per-operator tensors, and record the
// names of the graph-level input and output operands.
void RuntimeGraph::Build(const std::string &input_name, const std::string &output_name) {
  // Lazily initialize the graph if that has not happened yet.
  if (graph_state_ == GraphState::NeedInit) {
    const bool init_success = Init();
    LOG_IF(FATAL, !init_success) << "Init graph failed!";
  }

  CHECK(graph_state_ >= GraphState::NeedBuild)
      << "Graph status error, current state is " << int(graph_state_);
  LOG_IF(FATAL, this->operators_.empty())
      << "Graph operators is empty, may be no init";

  this->input_operators_maps_.clear();
  this->output_operators_maps_.clear();

  // Collect the graph's entry and exit nodes from the operator list.
  for (const auto &op : this->operators_) {
    if (op->type == "pnnx.Input") {
      // Input node of the operator list.
      this->input_operators_maps_.insert({op->name, op});
    } else if (op->type == "pnnx.Output") {
      // Output node; only a single output is supported for now.
      this->output_operators_maps_.insert({op->name, op});
    }
    // Other operator types are turned into layers in later lessons.
  }

  // Allocate the input and output tensor space of every operator.
  RuntimeGraphShape::InitOperatorInputTensor(operators_);
  RuntimeGraphShape::InitOperatorOutputTensor(graph_->ops, operators_);

  graph_state_ = GraphState::Complete;
  input_name_ = input_name;
  output_name_ = output_name;
}

初始化各算子的输入和输出空间

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
class RuntimeGraphShape {
public:
/**
* On the graph's first run, allocate the tensors needed by later Layer
* computations according to the shape of each node's input operand.
* On subsequent runs, verify that the input operand shapes still match
* the shapes of the tensors held inside the operands.
* @param operators Compute nodes of the graph
*/
static void InitOperatorInputTensor(const std::vector<std::shared_ptr<RuntimeOperator>> &operators);

/**
* On the graph's first run, allocate the tensors needed by later Layer
* computations according to the shape of each node's output operand.
* On subsequent runs, verify that the output operand shapes still match
* the shapes of the tensors held inside the operands.
* @param pnnx_operators pnnx graph nodes
* @param operators Compute nodes of the KuiperInfer graph
*/
static void InitOperatorOutputTensor(const std::vector<pnnx::Operator *> &pnnx_operators,
const std::vector<std::shared_ptr<RuntimeOperator>> &operators);
};
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
// On the first run, allocate input tensors for every operator according to
// its input operand shapes; on later runs, only shape-check the existing
// tensors so they are not re-allocated. Only float32 operands are supported.
void RuntimeGraphShape::InitOperatorInputTensor(
    const std::vector<std::shared_ptr<RuntimeOperator>> &operators) {
  if (operators.empty()) {
    LOG(ERROR) << "Operators for init input shapes is empty!";
    return;
  }
  for (const auto &op : operators) {
    // Operators without inputs (e.g. graph entry nodes) need no preparation.
    if (op->input_operands.empty()) {
      continue;
    }
    const std::map<std::string, std::shared_ptr<RuntimeOperand>>
        &input_operands_map = op->input_operands;
    for (const auto &input_operand_iter : input_operands_map) {
      // input_operand is the value of the name -> operand map entry.
      const auto &input_operand = input_operand_iter.second;

      const auto &type = input_operand->type;
      CHECK(type == RuntimeDataType::kTypeFloat32)
          << "The graph only support float32 yet!";
      const auto &input_operand_shape = input_operand->shapes;
      auto &input_datas = input_operand->datas;

      CHECK(!input_operand_shape.empty());
      // The first dimension is always the batch size.
      const int32_t batch = input_operand_shape.at(0);
      CHECK(batch >= 0) << "Dynamic batch size is not supported!";
      CHECK(input_operand_shape.size() == 2 ||
            input_operand_shape.size() == 4 ||
            input_operand_shape.size() == 3)
          << "Unsupported tensor shape sizes: " << input_operand_shape.size();

      if (!input_datas.empty()) {
        // Second (or later) run: tensors already exist, so only verify that
        // their physical shapes still match the operand's declared shape,
        // avoiding a second allocation.
        CHECK(input_datas.size() == static_cast<size_t>(batch))
            << "Batch size is wrong!";
        for (int32_t i = 0; i < batch; ++i) {
          // Tensor::shapes() always reports the physical 3-d shape.
          const std::vector<uint32_t> &input_data_shape =
              input_datas.at(i)->shapes();
          CHECK(input_data_shape.size() == 3)
              << "The origin shape size of operator input data does not "
                 "equal to three";
          if (input_operand_shape.size() == 4) {
            // operand shape = {batch, channels, rows, cols}
            CHECK(input_data_shape.at(0) == input_operand_shape.at(1) &&
                  input_data_shape.at(1) == input_operand_shape.at(2) &&
                  input_data_shape.at(2) == input_operand_shape.at(3));
          } else if (input_operand_shape.size() == 2) {
            // operand shape = {batch, cols}: stored as (1, cols, 1)
            CHECK(input_data_shape.at(1) == input_operand_shape.at(1) &&
                  input_data_shape.at(0) == 1 && input_data_shape.at(2) == 1);
          } else {
            // operand shape = {batch, rows, cols}: stored as (1, rows, cols)
            CHECK(input_data_shape.at(1) == input_operand_shape.at(1) &&
                  input_data_shape.at(0) == 1 &&
                  input_data_shape.at(2) == input_operand_shape.at(2));
          }
        }
      } else {
        // First run: allocate one tensor per batch element.
        input_datas.resize(batch);
        for (int32_t i = 0; i < batch; ++i) {
          if (input_operand_shape.size() == 4) {
            input_datas.at(i) = std::make_shared<Tensor<float>>(
                input_operand_shape.at(1), input_operand_shape.at(2),
                input_operand_shape.at(3));
          } else if (input_operand_shape.size() == 2) {
            input_datas.at(i) = std::make_shared<Tensor<float>>(
                1, input_operand_shape.at(1), 1);
          } else {
            // operand shape size is 3
            input_datas.at(i) = std::make_shared<Tensor<float>>(
                1, input_operand_shape.at(1), input_operand_shape.at(2));
          }
        }
      }
    }
  }
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
// On the first run, allocate output tensors for every operator according to
// the pnnx operand shapes; on later runs, shape-check the existing tensors
// and ReRawshape any whose shape has drifted.
void RuntimeGraphShape::InitOperatorOutputTensor(
const std::vector<pnnx::Operator *> &pnnx_operators,
const std::vector<std::shared_ptr<RuntimeOperator>> &operators) {
CHECK(!pnnx_operators.empty() && !operators.empty());
// operators.at(i) is the runtime counterpart of pnnx_operators.at(i).
CHECK(pnnx_operators.size() == operators.size());
for (uint32_t i = 0; i < pnnx_operators.size(); ++i) {
// Walk every pnnx operator and inspect its output operands.
const std::vector<pnnx::Operand *> operands = pnnx_operators.at(i)->outputs;
CHECK(operands.size() <= 1) << "Only support one node one output yet!";
if (operands.empty()) {
continue;
}
// operands is non-empty, so it holds exactly one output operand.
CHECK(operands.size() == 1) << "Only support one output in the KuiperInfer";
pnnx::Operand *operand = operands.front();
const auto &runtime_op = operators.at(i);
CHECK(operand != nullptr) << "Operand output is null";
const std::vector<int32_t> &operand_shapes = operand->shape;
const auto &output_tensors = runtime_op->output_operands;

// The first dimension is always the batch size.
const int32_t batch = operand_shapes.at(0);
CHECK(batch >= 0) << "Dynamic batch size is not supported!";
CHECK(operand_shapes.size() == 2 || operand_shapes.size() == 4 ||
operand_shapes.size() == 3)
<< "Unsupported shape sizes: " << operand_shapes.size();

if (!output_tensors) {
// output_operands is not allocated yet (first run): create the tensors.
std::shared_ptr<RuntimeOperand> output_operand =
std::make_shared<RuntimeOperand>();
output_operand->shapes = operand_shapes;
output_operand->type = RuntimeDataType::kTypeFloat32;
output_operand->name = operand->name + "_output";
for (int j = 0; j < batch; ++j) {
// Allocate one tensor per batch element and store it in output_operand.
if (operand_shapes.size() == 4) {
// operand shape = {batch, channels, rows, cols}
output_operand->datas.push_back(std::make_shared<Tensor<float>>(
operand_shapes.at(1), operand_shapes.at(2),
operand_shapes.at(3)));
} else if (operand_shapes.size() == 2) {
// operand shape = {batch, cols}: stored as (1, cols, 1)
output_operand->datas.push_back(
std::make_shared<Tensor<float>>(1, operand_shapes.at(1), 1));
} else {
// operand shape = {batch, rows, cols}: stored as (1, rows, cols)
output_operand->datas.push_back(std::make_shared<Tensor<float>>(
1, operand_shapes.at(1), operand_shapes.at(2)));
}
}
runtime_op->output_operands = std::move(output_operand);
} else {
// output_operands already exists (second or later run): validate it.
CHECK(batch == output_tensors->datas.size());
// The operand's type and declared shape must be unchanged.
CHECK(output_tensors->type == RuntimeDataType::kTypeFloat32);
CHECK(output_tensors->shapes == operand_shapes);
for (uint32_t b = 0; b < batch; ++b) {
// Shape-check each batch element; ReRawshape it if the shape drifted.
const std::vector<uint32_t> &tensor_shapes =
output_tensors->datas.at(b)->shapes();
if (operand_shapes.size() == 4) {
if (tensor_shapes.at(0) != operand_shapes.at(1) ||
tensor_shapes.at(1) != operand_shapes.at(2) ||
tensor_shapes.at(2) != operand_shapes.at(3)) {
DLOG(WARNING) << "The shape of tensor do not adapting with output operand";
const auto &target_shapes = std::vector<uint32_t>{(uint32_t) operand_shapes.at(1),
(uint32_t) operand_shapes.at(2),
(uint32_t) operand_shapes.at(3)};
output_tensors->datas.at(b)->ReRawshape(target_shapes);
}
} else if (operand_shapes.size() == 2) {
if (tensor_shapes.at(0) != 1 ||
tensor_shapes.at(1) != operand_shapes.at(1) ||
tensor_shapes.at(2) != 1) {
DLOG(WARNING) << "The shape of tensor do not adapting with output operand";
const auto &target_shapes = std::vector<uint32_t>{1, (uint32_t) operand_shapes.at(1), 1};
output_tensors->datas.at(b)->ReRawshape(target_shapes);
}
} else {
// operand shape size is 3
if (tensor_shapes.at(0) != 1 ||
tensor_shapes.at(1) != operand_shapes.at(1) ||
tensor_shapes.at(2) != operand_shapes.at(2)) {
DLOG(WARNING) << "The shape of tensor do not adapting with output operand";
const auto &target_shapes =
std::vector<uint32_t>{1, (uint32_t) operand_shapes.at(1), (uint32_t) operand_shapes.at(2)};
output_tensors->datas.at(b)->ReRawshape(target_shapes);
}
}
}
}
}
}