From b02b41e733740895e4b47b74ffc96b14bd9ee0c9 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 14 Jan 2026 14:15:41 +0800 Subject: [PATCH] pass --- CMakeLists.txt | 2 +- include/core/allocator.h | 5 +- src/core/allocator.cc | 99 +++++++++- src/core/graph.cc | 349 +++++++++++++++++++++++++++++++++++- src/operators/concat.cc | 18 ++ src/operators/matmul.cc | 39 +++- src/operators/transpose.cc | 23 ++- src/operators/unary.cc | 9 +- src/utils/operator_utils.cc | 28 ++- 9 files changed, 556 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 836a7e0..a1d5e4d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ # Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them. option(BUILD_TEST "Build tests" OFF) -cmake_minimum_required(VERSION 3.17) +cmake_minimum_required(VERSION 3.15) include(CMakeDependentOption) project(InfiniTensor C CXX) diff --git a/include/core/allocator.h b/include/core/allocator.h index 002601d..732e795 100644 --- a/include/core/allocator.h +++ b/include/core/allocator.h @@ -6,8 +6,6 @@ #endif #include #include -#include - namespace infini { class Allocator { @@ -27,7 +25,8 @@ namespace infini { // TODO:可能需要设计一个数据结构来存储free block,以便于管理和合并 // HINT: 可以使用一个 map 来存储 free block,key 为 block 的起始/结尾地址,value 为 block 的大小 // =================================== 作业 =================================== - + // key: free block start offset, value: free block size + std::map freeBlocks; public: Allocator(Runtime runtime); diff --git a/src/core/allocator.cc b/src/core/allocator.cc index ff593ae..14bc9c2 100644 --- a/src/core/allocator.cc +++ b/src/core/allocator.cc @@ -30,10 +30,51 @@ namespace infini size = this->getAlignedSize(size); // =================================== 作业 =================================== - // TODO: 设计一个算法来分配内存,返回起始地址偏移量 + // 采用 free-list + 合并 的方式进行模拟分配: + // 1) 优先从空闲块中找可用块(best-fit,减少碎片) + // 2) 若没有合适空闲块,则从末尾 bump 分配 + // 返回分配块的起始 offset // =================================== 作业 =================================== + size_t bestStart = 0; + size_t bestSize = 0; + bool found = false; + for (const auto &kv : freeBlocks) + { + const size_t start = kv.first; + const size_t blkSize = kv.second; + if (blkSize < size) + continue; + if (!found || blkSize < bestSize) + { + bestStart = start; + bestSize = blkSize; + found = true; + if (blkSize == size) + break; + } + } + + if (found) + { + auto it = freeBlocks.find(bestStart); + IT_ASSERT(it != freeBlocks.end()); + if (bestSize == size) + { + freeBlocks.erase(it); + } + else + { + freeBlocks.erase(it); + freeBlocks.emplace(bestStart + size, bestSize - size); + } + return bestStart; + } - return 0; + const size_t offset = this->used; + this->used += size; + if (this->used > this->peak) + this->peak = this->used; + return offset; } void Allocator::free(size_t addr, size_t size) @@ -42,8 +83,60 @@ namespace infini size = getAlignedSize(size); // =================================== 作业 =================================== - // TODO: 设计一个算法来回收内存 + // 回收逻辑: + // 1) 若释放的是末尾块,直接回退 used,并持续吞并末尾相邻的空闲块 + // 2) 否则插入 freeBlocks,并与前后相邻空闲块合并 // =================================== 作业 =================================== + IT_ASSERT(size > 0); + IT_ASSERT(addr + size <= this->used); + + // Case 1: free at the end -> shrink + if (addr + size == this->used) + { + this->used = addr; + // continue shrinking if there are free blocks at the new end + while (true) + { + if (freeBlocks.empty()) + break; + auto it = freeBlocks.upper_bound(this->used); + if (it == freeBlocks.begin()) + break; + --it; + const size_t start = it->first; + const size_t blkSize = it->second; + if (start + blkSize != this->used) + break; + this->used = start; + freeBlocks.erase(it); + } + return; + } + + // Case 2: insert + coalesce + size_t newStart = addr; + size_t newSize = size; + + auto next = freeBlocks.lower_bound(newStart); + if (next != freeBlocks.begin()) + { + auto prev = std::prev(next); + if (prev->first + prev->second == newStart) + { + newStart = prev->first; + newSize += prev->second; + freeBlocks.erase(prev); + } + } + + next = freeBlocks.lower_bound(newStart); + if (next != freeBlocks.end() && newStart + newSize == next->first) + { + newSize += next->second; + freeBlocks.erase(next); + } + + freeBlocks.emplace(newStart, newSize); } void *Allocator::getPtr() diff --git a/src/core/graph.cc b/src/core/graph.cc index 3a90637..503b3d2 100644 --- a/src/core/graph.cc +++ b/src/core/graph.cc @@ -1,7 +1,11 @@ #include "core/graph.h" +#include "operators/matmul.h" +#include "operators/transpose.h" #include #include #include +#include +#include namespace infini { @@ -106,6 +110,267 @@ namespace infini // 1. 去除冗余的算子(例如,两个相邻的算子都是 transpose 算子,且做的是相反的操作,可以将其全部删除) // 2. 合并算子(例如,矩阵乘算子中含有属性transA、transB,如果其输入存在transpose,且对最后两个维度做交换,就可以将transpose融入到矩阵乘算子的属性中去) // =================================== 作业 =================================== + + IT_ASSERT(topo_sort() == true); + + auto is_swap_last_two = [](const vector &perm, int rank) -> bool + { + if (rank < 2) + return false; + if ((int)perm.size() != rank) + return false; + for (int i = 0; i < rank - 2; ++i) + if (perm[i] != i) + return false; + return perm[rank - 2] == rank - 1 && perm[rank - 1] == rank - 2; + }; + + auto is_inverse_perm = [](const vector &p, const vector &q, + int rank) -> bool + { + if ((int)p.size() != rank || (int)q.size() != rank) + return false; + vector inv(rank, -1); + for (int i = 0; i < rank; ++i) + { + IT_ASSERT(p[i] >= 0 && p[i] < rank); + inv[p[i]] = i; + } + for (int i = 0; i < rank; ++i) + { + if (inv[i] != q[i]) + return false; + } + return true; + }; + + auto cleanup_dangling_tensors = [&]() + { + for (auto it = tensors.begin(); it != tensors.end();) + { + auto &t = *it; + if (!t) + { + it = tensors.erase(it); + continue; + } + if (t->targets.empty() && t->source.expired()) + it = tensors.erase(it); + else + ++it; + } + }; + + auto detach_op = [&](const Operator &op) + { + // Disconnect from predecessor/successor bookkeeping and tensor edges. + for (const auto &in : op->inputs) + { + if (!in) + continue; + in->removeTarget(op); + } + for (const auto &out : op->outputs) + { + if (!out) + continue; + if (out->source.lock() == op) + out->source.reset(); + } + + for (const auto &pred : op->getPredecessors()) + pred->removeSuccessors(op); + for (const auto &succ : op->getSuccessors()) + succ->removePredecessors(op); + + op->predecessors.clear(); + op->successors.clear(); + }; + + bool changed = false; + do + { + changed = false; + + std::unordered_set eraseOps; + + // Rule 2: fuse Transpose(swapping last two dims) into Matmul's transA/transB. + for (size_t opIdx = 0; opIdx < ops.size(); ++opIdx) + { + auto op = ops[opIdx]; + if (!op || eraseOps.count(op.get())) + continue; + if (op->getOpType() != OpType::MatMul) + continue; + auto mm = std::dynamic_pointer_cast(op); + if (!mm) + continue; + + for (int inputIdx = 0; inputIdx < 2; ++inputIdx) + { + auto in = op->inputs[inputIdx]; + if (!in) + continue; + auto pred = in->getSource(); + if (!pred || eraseOps.count(pred.get()) || + pred->getOpType() != OpType::Transpose) + continue; + + // Only safe to fuse if transpose output is used only by this matmul. + if (in->getTargets().size() != 1) + continue; + + auto tp = std::dynamic_pointer_cast(pred); + if (!tp) + continue; + const auto perm = tp->getPermute(); + const int rank = static_cast(in->getRank()); + if (!is_swap_last_two(perm, rank)) + continue; + + auto orig = pred->inputs[0]; + if (!orig) + continue; + + // Rewire matmul to consume transpose input directly. + op->replaceInput(in, orig); + in->removeTarget(op); + orig->addTarget(op); + + // Update predecessor/successor relation. + op->removePredecessors(pred); + pred->removeSuccessors(op); + if (auto origPred = orig->getSource()) + { + origPred->addSuccessors(op); + op->addPredecessors(origPred); + } + + // Toggle trans flag. + if (inputIdx == 0) + mm->setTransA(!mm->getTransA()); + else + mm->setTransB(!mm->getTransB()); + + // If transpose becomes unused, remove it. + if (in->getTargets().empty()) + { + detach_op(pred); + eraseOps.insert(pred.get()); + changed = true; + } + } + } + + if (!eraseOps.empty()) + { + ops.erase(std::remove_if(ops.begin(), ops.end(), + [&](const Operator &op) + { + return !op || eraseOps.count(op.get()); + }), + ops.end()); + } + + // Rule 1: remove adjacent inverse Transpose pairs. + for (auto it = ops.begin(); it != ops.end();) + { + auto &op1 = *it; + if (op1->getOpType() != OpType::Transpose) + { + ++it; + continue; + } + auto t1 = std::dynamic_pointer_cast(op1); + if (!t1) + { + ++it; + continue; + } + auto y = op1->outputs[0]; + if (!y || y->getTargets().size() != 1) + { + ++it; + continue; + } + auto op2 = y->getTargets()[0]; + if (!op2 || op2->getOpType() != OpType::Transpose) + { + ++it; + continue; + } + auto t2 = std::dynamic_pointer_cast(op2); + if (!t2) + { + ++it; + continue; + } + auto z = op2->outputs[0]; + if (!z) + { + ++it; + continue; + } + // Skip if z is a graph output (no targets) since we cannot safely + // replace external tensor references. + if (z->getTargets().empty()) + { + ++it; + continue; + } + + auto x = op1->inputs[0]; + if (!x) + { + ++it; + continue; + } + + const auto p1 = t1->getPermute(); + const auto p2 = t2->getPermute(); + const int rank = static_cast(y->getRank()); + if (!is_inverse_perm(p1, p2, rank)) + { + ++it; + continue; + } + + // Rewire: replace uses of z with x. + auto succs = z->getTargets(); + for (auto &succ : succs) + { + succ->replaceInput(z, x); + z->removeTarget(succ); + x->addTarget(succ); + + succ->removePredecessors(op2); + op2->removeSuccessors(succ); + if (auto xp = x->getSource()) + { + xp->addSuccessors(succ); + succ->addPredecessors(xp); + } + } + + // Remove the two transpose ops and their dangling tensors. + detach_op(op1); + detach_op(op2); + ops.erase(std::remove(ops.begin(), ops.end(), op1), ops.end()); + ops.erase(std::remove(ops.begin(), ops.end(), op2), ops.end()); + cleanup_dangling_tensors(); + + changed = true; + // Restart since iterators invalidated. + it = ops.begin(); + } + + if (changed) + { + sorted = false; + IT_ASSERT(topo_sort() == true); + cleanup_dangling_tensors(); + } + } while (changed); } Tensor GraphObj::getTensor(int fuid) const @@ -147,18 +412,98 @@ namespace infini { // topological sorting first IT_ASSERT(topo_sort() == true); - // =================================== 作业 =================================== // TODO:利用 allocator 给计算图分配内存 // HINT: 获取分配好的内存指针后,可以调用 tensor 的 setDataBlob 函数给 tensor 绑定内存 // =================================== 作业 =================================== + // Pass 1: simulate allocation to compute offsets and peak memory. + std::unordered_map offsetMap; + std::unordered_map remainingUses; + std::unordered_set pinned; + + pinned.reserve(tensors.size()); + remainingUses.reserve(tensors.size()); + offsetMap.reserve(tensors.size()); + + // Pin graph inputs/outputs: keep their storage alive. + for (const auto &t : tensors) + { + if (!t) + continue; + if (!t->getSource() || t->getTargets().empty()) + pinned.insert(t.get()); + remainingUses.emplace(t.get(), t->getTargets().size()); + } + + // Allocate graph inputs first (they have no source op). + for (const auto &t : tensors) + { + if (!t) + continue; + if (!t->getSource()) + { + auto off = allocator.alloc(t->getBytes()); + offsetMap.emplace(t.get(), off); + } + } + + // Allocate outputs when produced; free intermediates after last use. + for (const auto &op : ops) + { + // Allocate op outputs + for (const auto &out : op->getOutputs()) + { + if (!out) + continue; + if (offsetMap.find(out.get()) == offsetMap.end()) + { + auto off = allocator.alloc(out->getBytes()); + offsetMap.emplace(out.get(), off); + } + } + + // Consume op inputs; free when no longer needed. + for (const auto &in : op->getInputs()) + { + if (!in) + continue; + auto *tp = in.get(); + if (pinned.find(tp) != pinned.end()) + continue; + auto it = remainingUses.find(tp); + IT_ASSERT(it != remainingUses.end()); + IT_ASSERT(it->second > 0); + it->second--; + if (it->second == 0) + { + auto offIt = offsetMap.find(tp); + IT_ASSERT(offIt != offsetMap.end()); + allocator.free(offIt->second, in->getBytes()); + } + } + } + + // Pass 2: allocate the real arena once, then bind each tensor's blob. + void *base = allocator.getPtr(); + for (const auto &t : tensors) + { + if (!t) + continue; + auto it = offsetMap.find(t.get()); + if (it == offsetMap.end()) + continue; + auto ptr = static_cast(static_cast(base) + it->second); + t->setDataBlob(make_ref(runtime, ptr)); + } + allocator.info(); } Tensor GraphObj::addTensor(Shape dim, DataType dtype) { - return tensors.emplace_back(make_ref(dim, dtype, runtime)); + tensors.emplace_back(make_ref(dim, dtype, runtime)); + return tensors.back(); } Tensor GraphObj::addTensor(const Tensor &tensor) diff --git a/src/operators/concat.cc b/src/operators/concat.cc index d196330..4b0472d 100644 --- a/src/operators/concat.cc +++ b/src/operators/concat.cc @@ -18,6 +18,24 @@ optional> ConcatObj::inferShape(const TensorVec &inputs) { // REF: https://onnx.ai/onnx/operators/onnx__Concat.html#concat-13 // =================================== 作业 =================================== + IT_ASSERT(!inputs.empty()); + IT_ASSERT(dim >= 0 && dim < (int)rank); + + int concatSize = dims[dim]; + for (size_t i = 1; i < inputs.size(); ++i) { + IT_ASSERT(inputs[i]->getRank() == rank); + const Shape other = inputs[i]->getDims(); + IT_ASSERT(other.size() == dims.size()); + + for (size_t axis = 0; axis < rank; ++axis) { + if ((int)axis == dim) + continue; + IT_ASSERT(other[axis] == dims[axis]); + } + concatSize += other[dim]; + } + dims[dim] = concatSize; + return {{dims}}; } diff --git a/src/operators/matmul.cc b/src/operators/matmul.cc index 7a16ca2..1c66d87 100644 --- a/src/operators/matmul.cc +++ b/src/operators/matmul.cc @@ -1,4 +1,5 @@ #include "operators/matmul.h" +#include "utils/operator_utils.h" namespace infini { @@ -27,7 +28,43 @@ namespace infini // TODO:返回经过 matmul 操作后的 shape // REF: https://github.com/onnx/onnx/blob/main/docs/Operators.md#gemm // =================================== 作业 =================================== - return std::nullopt; + + IT_ASSERT(inputs.size() == 2); + const auto A = inputs[0]; + const auto B = inputs[1]; + + const Shape dimsA = A->getDims(); + const Shape dimsB = B->getDims(); + const int rankA = static_cast(dimsA.size()); + const int rankB = static_cast(dimsB.size()); + IT_ASSERT(rankA >= 2 && rankB >= 2); + + const int a0 = dimsA[rankA - 2]; + const int a1 = dimsA[rankA - 1]; + const int b0 = dimsB[rankB - 2]; + const int b1 = dimsB[rankB - 1]; + + const int m_ = transA ? a1 : a0; + const int kA = transA ? a0 : a1; + const int kB = transB ? b1 : b0; + const int n_ = transB ? b0 : b1; + + IT_ASSERT(kA == kB); + + m = m_; + n = n_; + k = kA; + + Shape batchA, batchB; + if (rankA > 2) + batchA = Shape(dimsA.begin(), dimsA.end() - 2); + if (rankB > 2) + batchB = Shape(dimsB.begin(), dimsB.end() - 2); + + Shape out = infer_broadcast(batchA, batchB); + out.push_back(m); + out.push_back(n); + return {{out}}; } } // namespace infini \ No newline at end of file diff --git a/src/operators/transpose.cc b/src/operators/transpose.cc index faab2b6..0e1ff17 100644 --- a/src/operators/transpose.cc +++ b/src/operators/transpose.cc @@ -34,7 +34,28 @@ namespace infini // REF: https://onnx.ai/onnx/operators/onnx__Transpose.html#transpose-21 // =================================== 作业 =================================== - return std::nullopt; + IT_ASSERT(static_cast(input_dim.size()) == rank); + + // If perm is not provided, ONNX default is reversing the dimensions. + vector perm = transposePermute; + if (perm.empty()) { + perm.resize(rank); + for (int i = 0; i < rank; ++i) + perm[i] = rank - 1 - i; + } + + IT_ASSERT(static_cast(perm.size()) == rank); + + vector seen(rank, 0); + for (int outAxis = 0; outAxis < rank; ++outAxis) + { + int inAxis = perm[outAxis]; + IT_ASSERT(inAxis >= 0 && inAxis < rank); + IT_ASSERT(++seen[inAxis] == 1); + output_dim[outAxis] = input_dim[inAxis]; + } + + return {{output_dim}}; } std::string TransposeObj::toString() const diff --git a/src/operators/unary.cc b/src/operators/unary.cc index 3daad36..8cf63f4 100644 --- a/src/operators/unary.cc +++ b/src/operators/unary.cc @@ -39,7 +39,8 @@ namespace infini // TODO:返回经过 clip 操作后的 shape // REF: https://onnx.ai/onnx/operators/onnx__Clip.html#clip-13 // =================================== 作业 =================================== - return std::nullopt; + const auto X = inputs[0]; + return {{X->getDims()}}; } std::string ClipObj::toString() const @@ -66,7 +67,8 @@ namespace infini // REF_FILE: src/core/operator.cc // REF: https://onnx.ai/onnx/operators/onnx__Cast.html#cast-21 // =================================== 作业 =================================== - return {}; + (void)inputs; + return {getOutputDataType()}; } optional> CastObj::inferShape(const TensorVec &inputs) @@ -75,7 +77,8 @@ namespace infini // TODO:返回经过 cast 操作后的 shape // REF: https://onnx.ai/onnx/operators/onnx__Cast.html#cast-21 // =================================== 作业 =================================== - return std::nullopt; + const auto X = inputs[0]; + return {{X->getDims()}}; } std::string CastObj::toString() const diff --git a/src/utils/operator_utils.cc b/src/utils/operator_utils.cc index edbd2c8..bb0dd8a 100644 --- a/src/utils/operator_utils.cc +++ b/src/utils/operator_utils.cc @@ -9,8 +9,32 @@ Shape infer_broadcast(const Shape &A, const Shape &B) { // TODO:对 A 和 B 进行双向广播,返回广播后的形状。 // REF: https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md // =================================== 作业 =================================== - - return {}; + + const size_t rankA = A.size(); + const size_t rankB = B.size(); + const size_t rank = std::max(rankA, rankB); + + Shape out(rank, 1); + + // Align dimensions from the trailing axis (like NumPy/ONNX broadcasting). + for (size_t i = 0; i < rank; ++i) { + const size_t aIdx = (i < rank - rankA) ? (size_t)-1 : (i - (rank - rankA)); + const size_t bIdx = (i < rank - rankB) ? (size_t)-1 : (i - (rank - rankB)); + + const int aDim = (aIdx == (size_t)-1) ? 1 : A[aIdx]; + const int bDim = (bIdx == (size_t)-1) ? 1 : B[bIdx]; + + if (aDim == bDim) + out[i] = aDim; + else if (aDim == 1) + out[i] = bDim; + else if (bDim == 1) + out[i] = aDim; + else + IT_ASSERT(false, "Broadcast failed: incompatible dimensions"); + } + + return out; } int get_real_axis(const int &axis, const int &rank) {