From b02b41e733740895e4b47b74ffc96b14bd9ee0c9 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@localhost.localdomain>
Date: Wed, 14 Jan 2026 14:15:41 +0800
Subject: [PATCH] Implement allocator, graph optimizer and operator shape inference

---
 CMakeLists.txt              |   2 +-
 include/core/allocator.h    |   5 +-
 src/core/allocator.cc       |  99 +++++++++-
 src/core/graph.cc           | 349 +++++++++++++++++++++++++++++++++++-
 src/operators/concat.cc     |  18 ++
 src/operators/matmul.cc     |  39 +++-
 src/operators/transpose.cc  |  23 ++-
 src/operators/unary.cc      |   9 +-
 src/utils/operator_utils.cc |  28 ++-
 9 files changed, 556 insertions(+), 16 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 836a7e0..a1d5e4d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 # Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them.
 option(BUILD_TEST "Build tests" OFF)
 
-cmake_minimum_required(VERSION 3.17)
+cmake_minimum_required(VERSION 3.15)
 
 include(CMakeDependentOption)
 project(InfiniTensor C CXX)
diff --git a/include/core/allocator.h b/include/core/allocator.h
index 002601d..732e795 100644
--- a/include/core/allocator.h
+++ b/include/core/allocator.h
@@ -6,8 +6,6 @@
 #endif
 #include <cstddef>
 #include <map>
-#include <unordered_set>
-
 namespace infini {
   class Allocator
   {
@@ -27,7 +25,8 @@ namespace infini {
     // TODO：可能需要设计一个数据结构来存储free block，以便于管理和合并
     // HINT: 可以使用一个 map 来存储 free block，key 为 block 的起始/结尾地址，value 为 block 的大小
     // =================================== 作业 ===================================
-
+    // key: free block start offset, value: free block size
+    std::map<size_t, size_t> freeBlocks;
   public:
     Allocator(Runtime runtime);
 
diff --git a/src/core/allocator.cc b/src/core/allocator.cc
index ff593ae..14bc9c2 100644
--- a/src/core/allocator.cc
+++ b/src/core/allocator.cc
@@ -30,10 +30,51 @@ namespace infini
         size = this->getAlignedSize(size);
 
         // =================================== 作业 ===================================
-        // TODO: 设计一个算法来分配内存，返回起始地址偏移量
+        // 采用 free-list + 合并 的方式进行模拟分配：
+        // 1) 优先从空闲块中找可用块（best-fit，减少碎片）
+        // 2) 若没有合适空闲块，则从末尾 bump 分配
+        // 返回分配块的起始 offset
         // =================================== 作业 ===================================
+        size_t bestStart = 0;
+        size_t bestSize = 0;
+        bool found = false;
+        for (const auto &kv : freeBlocks)
+        {
+            const size_t start = kv.first;
+            const size_t blkSize = kv.second;
+            if (blkSize < size)
+                continue;
+            if (!found || blkSize < bestSize)
+            {
+                bestStart = start;
+                bestSize = blkSize;
+                found = true;
+                if (blkSize == size)
+                    break;
+            }
+        }
+
+        if (found)
+        {
+            auto it = freeBlocks.find(bestStart);
+            IT_ASSERT(it != freeBlocks.end());
+            if (bestSize == size)
+            {
+                freeBlocks.erase(it);
+            }
+            else
+            {
+                freeBlocks.erase(it);
+                freeBlocks.emplace(bestStart + size, bestSize - size);
+            }
+            return bestStart;
+        }
 
-        return 0;
+        const size_t offset = this->used;
+        this->used += size;
+        if (this->used > this->peak)
+            this->peak = this->used;
+        return offset;
     }
 
     void Allocator::free(size_t addr, size_t size)
@@ -42,8 +83,60 @@ namespace infini
         size = getAlignedSize(size);
 
         // =================================== 作业 ===================================
-        // TODO: 设计一个算法来回收内存
+        // 回收逻辑：
+        // 1) 若释放的是末尾块，直接回退 used，并持续吞并末尾相邻的空闲块
+        // 2) 否则插入 freeBlocks，并与前后相邻空闲块合并
         // =================================== 作业 ===================================
+        IT_ASSERT(size > 0);
+        IT_ASSERT(addr + size <= this->used);
+
+        // Case 1: free at the end -> shrink
+        if (addr + size == this->used)
+        {
+            this->used = addr;
+            // continue shrinking if there are free blocks at the new end
+            while (true)
+            {
+                if (freeBlocks.empty())
+                    break;
+                auto it = freeBlocks.upper_bound(this->used);
+                if (it == freeBlocks.begin())
+                    break;
+                --it;
+                const size_t start = it->first;
+                const size_t blkSize = it->second;
+                if (start + blkSize != this->used)
+                    break;
+                this->used = start;
+                freeBlocks.erase(it);
+            }
+            return;
+        }
+
+        // Case 2: insert + coalesce
+        size_t newStart = addr;
+        size_t newSize = size;
+
+        auto next = freeBlocks.lower_bound(newStart);
+        if (next != freeBlocks.begin())
+        {
+            auto prev = std::prev(next);
+            if (prev->first + prev->second == newStart)
+            {
+                newStart = prev->first;
+                newSize += prev->second;
+                freeBlocks.erase(prev);
+            }
+        }
+
+        next = freeBlocks.lower_bound(newStart);
+        if (next != freeBlocks.end() && newStart + newSize == next->first)
+        {
+            newSize += next->second;
+            freeBlocks.erase(next);
+        }
+
+        freeBlocks.emplace(newStart, newSize);
     }
 
     void *Allocator::getPtr()
diff --git a/src/core/graph.cc b/src/core/graph.cc
index 3a90637..503b3d2 100644
--- a/src/core/graph.cc
+++ b/src/core/graph.cc
@@ -1,7 +1,11 @@
 #include "core/graph.h"
+#include "operators/matmul.h"
+#include "operators/transpose.h"
 #include <algorithm>
 #include <numeric>
 #include <queue>
+#include <unordered_map>
+#include <unordered_set>
 
 namespace infini
 {
@@ -106,6 +110,267 @@ namespace infini
         // 1. 去除冗余的算子（例如，两个相邻的算子都是 transpose 算子，且做的是相反的操作，可以将其全部删除）
         // 2. 合并算子（例如，矩阵乘算子中含有属性transA、transB，如果其输入存在transpose，且对最后两个维度做交换，就可以将transpose融入到矩阵乘算子的属性中去）
         // =================================== 作业 ===================================
+
+        IT_ASSERT(topo_sort() == true);
+
+        auto is_swap_last_two = [](const vector<int> &perm, int rank) -> bool
+        {
+            // True iff perm is the identity except that its last two axes are swapped.
+            if (rank < 2 || static_cast<int>(perm.size()) != rank)
+                return false;
+            for (int axis = 0; axis + 2 < rank; ++axis)
+                if (perm[axis] != axis)
+                    return false;
+            const int last = rank - 1;
+            return perm[last - 1] == last && perm[last] == last - 1;
+        };
+
+        auto is_inverse_perm = [](const vector<int> &p, const vector<int> &q,
+                                  int rank) -> bool
+        {
+            // True iff q equals the inverse of p, i.e. q[p[i]] == i for every i.
+            // Both permutations must have exactly `rank` entries.
+            if (static_cast<int>(p.size()) != rank ||
+                static_cast<int>(q.size()) != rank)
+                return false;
+            // inverse[a] is the position of axis a in p; -1 if a never occurs.
+            vector<int> inverse(rank, -1);
+            for (int pos = 0; pos < rank; ++pos)
+            {
+                IT_ASSERT(p[pos] >= 0 && p[pos] < rank);
+                inverse[p[pos]] = pos;
+            }
+            // q undoes p exactly when it matches that inverse element-wise.
+            return std::equal(inverse.begin(), inverse.end(), q.begin());
+        };
+
+        auto cleanup_dangling_tensors = [&]()
+        {
+            // Purge the graph's tensor list of entries that are no longer part
+            // of the graph: null handles, and tensors that have both an empty
+            // target list and an expired source operator.
+            // Survivors keep their relative order.
+            decltype(tensors) kept;
+            for (const auto &t : tensors)
+            {
+                if (!t)
+                    continue;
+                if (!t->targets.empty() || !t->source.expired())
+                    kept.push_back(t);
+            }
+            tensors.swap(kept);
+        };
+
+        auto detach_op = [&](const Operator &op)
+        {
+            // Unlink `op` from tensor edges and operator adjacency lists; it does NOT erase op from `ops`.
+            for (const auto &in : op->inputs) // drop op from each input's target list
+            {
+                if (!in)
+                    continue;
+                in->removeTarget(op);
+            }
+            for (const auto &out : op->outputs) // clear each output's source link
+            {
+                if (!out)
+                    continue;
+                if (out->source.lock() == op) // only reset a source that still is op
+                    out->source.reset();
+            }
+            // Remove op from its neighbours' successor / predecessor lists ...
+            for (const auto &pred : op->getPredecessors())
+                pred->removeSuccessors(op);
+            for (const auto &succ : op->getSuccessors())
+                succ->removePredecessors(op);
+            // ... and finally forget op's own neighbour lists.
+            op->predecessors.clear();
+            op->successors.clear();
+        };
+
+        bool changed = false;
+        do
+        {
+            changed = false;
+
+            std::unordered_set<OperatorObj *> eraseOps;
+
+            // Rule 2: fuse Transpose(swapping last two dims) into Matmul's transA/transB.
+            for (size_t opIdx = 0; opIdx < ops.size(); ++opIdx)
+            {
+                auto op = ops[opIdx];
+                if (!op || eraseOps.count(op.get()))
+                    continue;
+                if (op->getOpType() != OpType::MatMul)
+                    continue;
+                auto mm = std::dynamic_pointer_cast<MatmulObj>(op);
+                if (!mm)
+                    continue;
+
+                for (int inputIdx = 0; inputIdx < 2; ++inputIdx)
+                {
+                    auto in = op->inputs[inputIdx];
+                    if (!in)
+                        continue;
+                    auto pred = in->getSource();
+                    if (!pred || eraseOps.count(pred.get()) ||
+                        pred->getOpType() != OpType::Transpose)
+                        continue;
+
+                    // Only safe to fuse if transpose output is used only by this matmul.
+                    if (in->getTargets().size() != 1)
+                        continue;
+
+                    auto tp = std::dynamic_pointer_cast<TransposeObj>(pred);
+                    if (!tp)
+                        continue;
+                    const auto perm = tp->getPermute();
+                    const int rank = static_cast<int>(in->getRank());
+                    if (!is_swap_last_two(perm, rank))
+                        continue;
+
+                    auto orig = pred->inputs[0];
+                    if (!orig)
+                        continue;
+
+                    // Rewire matmul to consume transpose input directly.
+                    op->replaceInput(in, orig);
+                    in->removeTarget(op);
+                    orig->addTarget(op);
+
+                    // Update predecessor/successor relation.
+                    op->removePredecessors(pred);
+                    pred->removeSuccessors(op);
+                    if (auto origPred = orig->getSource())
+                    {
+                        origPred->addSuccessors(op);
+                        op->addPredecessors(origPred);
+                    }
+
+                    // Toggle trans flag.
+                    if (inputIdx == 0)
+                        mm->setTransA(!mm->getTransA());
+                    else
+                        mm->setTransB(!mm->getTransB());
+
+                    // If transpose becomes unused, remove it.
+                    if (in->getTargets().empty())
+                    {
+                        detach_op(pred);
+                        eraseOps.insert(pred.get());
+                        changed = true;
+                    }
+                }
+            }
+
+            if (!eraseOps.empty())
+            {
+                ops.erase(std::remove_if(ops.begin(), ops.end(),
+                                         [&](const Operator &op)
+                                         {
+                                             return !op || eraseOps.count(op.get());
+                                         }),
+                          ops.end());
+            }
+
+            // Rule 1: remove adjacent inverse Transpose pairs (x -T1-> y -T2-> z becomes x).
+            for (auto it = ops.begin(); it != ops.end();)
+            {
+                auto op1 = *it; // own a copy: a reference into ops would alias std::remove's range below
+                if (op1->getOpType() != OpType::Transpose)
+                {
+                    ++it;
+                    continue;
+                }
+                auto t1 = std::dynamic_pointer_cast<TransposeObj>(op1);
+                if (!t1)
+                {
+                    ++it;
+                    continue;
+                }
+                auto y = op1->outputs[0];
+                if (!y || y->getTargets().size() != 1)
+                {
+                    ++it;
+                    continue;
+                }
+                auto op2 = y->getTargets()[0];
+                if (!op2 || op2->getOpType() != OpType::Transpose)
+                {
+                    ++it;
+                    continue;
+                }
+                auto t2 = std::dynamic_pointer_cast<TransposeObj>(op2);
+                if (!t2)
+                {
+                    ++it;
+                    continue;
+                }
+                auto z = op2->outputs[0];
+                if (!z)
+                {
+                    ++it;
+                    continue;
+                }
+                // Skip if z is a graph output (no targets) since we cannot safely
+                // replace external tensor references.
+                if (z->getTargets().empty())
+                {
+                    ++it;
+                    continue;
+                }
+
+                auto x = op1->inputs[0];
+                if (!x)
+                {
+                    ++it;
+                    continue;
+                }
+
+                const auto p1 = t1->getPermute();
+                const auto p2 = t2->getPermute();
+                const int rank = static_cast<int>(y->getRank());
+                if (!is_inverse_perm(p1, p2, rank))
+                {
+                    ++it;
+                    continue;
+                }
+
+                // Rewire: replace uses of z with x.
+                auto succs = z->getTargets();
+                for (auto &succ : succs)
+                {
+                    succ->replaceInput(z, x);
+                    z->removeTarget(succ);
+                    x->addTarget(succ);
+
+                    succ->removePredecessors(op2);
+                    op2->removeSuccessors(succ);
+                    if (auto xp = x->getSource())
+                    {
+                        xp->addSuccessors(succ);
+                        succ->addPredecessors(xp);
+                    }
+                }
+
+                // Remove the two transpose ops and their dangling tensors.
+                detach_op(op1);
+                detach_op(op2);
+                ops.erase(std::remove(ops.begin(), ops.end(), op1), ops.end());
+                ops.erase(std::remove(ops.begin(), ops.end(), op2), ops.end());
+                cleanup_dangling_tensors();
+
+                changed = true;
+                // Restart since iterators invalidated.
+                it = ops.begin();
+            }
+
+            if (changed)
+            {
+                sorted = false;
+                IT_ASSERT(topo_sort() == true);
+                cleanup_dangling_tensors();
+            }
+        } while (changed);
     }
 
     Tensor GraphObj::getTensor(int fuid) const
@@ -147,18 +412,98 @@ namespace infini
     {
         // topological sorting first
         IT_ASSERT(topo_sort() == true);
-
         // =================================== 作业 ===================================
         // TODO：利用 allocator 给计算图分配内存
         // HINT: 获取分配好的内存指针后，可以调用 tensor 的 setDataBlob 函数给 tensor 绑定内存
         // =================================== 作业 ===================================
 
+        // Pass 1: simulate allocation to compute offsets and peak memory.
+        std::unordered_map<TensorObj *, size_t> offsetMap;
+        std::unordered_map<TensorObj *, size_t> remainingUses;
+        std::unordered_set<TensorObj *> pinned;
+
+        pinned.reserve(tensors.size());
+        remainingUses.reserve(tensors.size());
+        offsetMap.reserve(tensors.size());
+
+        // Pin graph inputs/outputs: keep their storage alive.
+        for (const auto &t : tensors)
+        {
+            if (!t)
+                continue;
+            if (!t->getSource() || t->getTargets().empty())
+                pinned.insert(t.get());
+            remainingUses.emplace(t.get(), t->getTargets().size());
+        }
+
+        // Allocate graph inputs first (they have no source op).
+        for (const auto &t : tensors)
+        {
+            if (!t)
+                continue;
+            if (!t->getSource())
+            {
+                auto off = allocator.alloc(t->getBytes());
+                offsetMap.emplace(t.get(), off);
+            }
+        }
+
+        // Allocate outputs when produced; free intermediates after last use.
+        for (const auto &op : ops)
+        {
+            // Allocate op outputs
+            for (const auto &out : op->getOutputs())
+            {
+                if (!out)
+                    continue;
+                if (offsetMap.find(out.get()) == offsetMap.end())
+                {
+                    auto off = allocator.alloc(out->getBytes());
+                    offsetMap.emplace(out.get(), off);
+                }
+            }
+
+            // Consume op inputs; free when no longer needed.
+            for (const auto &in : op->getInputs())
+            {
+                if (!in)
+                    continue;
+                auto *tp = in.get();
+                if (pinned.find(tp) != pinned.end())
+                    continue;
+                auto it = remainingUses.find(tp);
+                IT_ASSERT(it != remainingUses.end());
+                IT_ASSERT(it->second > 0);
+                it->second--;
+                if (it->second == 0)
+                {
+                    auto offIt = offsetMap.find(tp);
+                    IT_ASSERT(offIt != offsetMap.end());
+                    allocator.free(offIt->second, in->getBytes());
+                }
+            }
+        }
+
+        // Pass 2: allocate the real arena once, then bind each tensor's blob.
+        void *base = allocator.getPtr();
+        for (const auto &t : tensors)
+        {
+            if (!t)
+                continue;
+            auto it = offsetMap.find(t.get());
+            if (it == offsetMap.end())
+                continue;
+            auto ptr = static_cast<void *>(static_cast<char *>(base) + it->second);
+            t->setDataBlob(make_ref<BlobObj>(runtime, ptr));
+        }
+
         allocator.info();
     }
 
     Tensor GraphObj::addTensor(Shape dim, DataType dtype)
     {
-        return tensors.emplace_back(make_ref<TensorObj>(dim, dtype, runtime));
+        tensors.emplace_back(make_ref<TensorObj>(dim, dtype, runtime));
+        return tensors.back();
     }
 
     Tensor GraphObj::addTensor(const Tensor &tensor)
diff --git a/src/operators/concat.cc b/src/operators/concat.cc
index d196330..4b0472d 100644
--- a/src/operators/concat.cc
+++ b/src/operators/concat.cc
@@ -18,6 +18,24 @@ optional<vector<Shape>> ConcatObj::inferShape(const TensorVec &inputs) {
     // REF: https://onnx.ai/onnx/operators/onnx__Concat.html#concat-13
     // =================================== 作业 ===================================
 
+    IT_ASSERT(!inputs.empty());
+    IT_ASSERT(dim >= 0 && dim < (int)rank);
+
+    int concatSize = dims[dim];
+    for (size_t i = 1; i < inputs.size(); ++i) {
+        IT_ASSERT(inputs[i]->getRank() == rank);
+        const Shape other = inputs[i]->getDims();
+        IT_ASSERT(other.size() == dims.size());
+
+        for (size_t axis = 0; axis < rank; ++axis) {
+            if ((int)axis == dim)
+                continue;
+            IT_ASSERT(other[axis] == dims[axis]);
+        }
+        concatSize += other[dim];
+    }
+    dims[dim] = concatSize;
+
     return {{dims}};
 }
 
diff --git a/src/operators/matmul.cc b/src/operators/matmul.cc
index 7a16ca2..1c66d87 100644
--- a/src/operators/matmul.cc
+++ b/src/operators/matmul.cc
@@ -1,4 +1,5 @@
 #include "operators/matmul.h"
+#include "utils/operator_utils.h"
 
 namespace infini
 {
@@ -27,7 +28,43 @@ namespace infini
         // TODO：返回经过 matmul 操作后的 shape
         // REF: https://github.com/onnx/onnx/blob/main/docs/Operators.md#gemm
         // =================================== 作业 ===================================
-        return std::nullopt;
+
+        IT_ASSERT(inputs.size() == 2);
+        const auto A = inputs[0];
+        const auto B = inputs[1];
+
+        const Shape dimsA = A->getDims();
+        const Shape dimsB = B->getDims();
+        const int rankA = static_cast<int>(dimsA.size());
+        const int rankB = static_cast<int>(dimsB.size());
+        IT_ASSERT(rankA >= 2 && rankB >= 2);
+
+        const int a0 = dimsA[rankA - 2];
+        const int a1 = dimsA[rankA - 1];
+        const int b0 = dimsB[rankB - 2];
+        const int b1 = dimsB[rankB - 1];
+
+        const int m_ = transA ? a1 : a0;
+        const int kA = transA ? a0 : a1;
+        const int kB = transB ? b1 : b0;
+        const int n_ = transB ? b0 : b1;
+
+        IT_ASSERT(kA == kB);
+
+        m = m_;
+        n = n_;
+        k = kA;
+
+        Shape batchA, batchB;
+        if (rankA > 2)
+            batchA = Shape(dimsA.begin(), dimsA.end() - 2);
+        if (rankB > 2)
+            batchB = Shape(dimsB.begin(), dimsB.end() - 2);
+
+        Shape out = infer_broadcast(batchA, batchB);
+        out.push_back(m);
+        out.push_back(n);
+        return {{out}};
     }
 
 } // namespace infini
\ No newline at end of file
diff --git a/src/operators/transpose.cc b/src/operators/transpose.cc
index faab2b6..0e1ff17 100644
--- a/src/operators/transpose.cc
+++ b/src/operators/transpose.cc
@@ -34,7 +34,28 @@ namespace infini
         // REF: https://onnx.ai/onnx/operators/onnx__Transpose.html#transpose-21
         // =================================== 作业 ===================================
 
-        return std::nullopt;
+        IT_ASSERT(static_cast<int>(input_dim.size()) == rank);
+
+        // If perm is not provided, ONNX default is reversing the dimensions.
+        vector<int> perm = transposePermute;
+        if (perm.empty()) {
+            perm.resize(rank);
+            for (int i = 0; i < rank; ++i)
+                perm[i] = rank - 1 - i;
+        }
+
+        IT_ASSERT(static_cast<int>(perm.size()) == rank);
+
+        vector<int> seen(rank, 0); // seen[a] != 0 once input axis a has been used
+        for (int outAxis = 0; outAxis < rank; ++outAxis)
+        {
+            const int inAxis = perm[outAxis];
+            IT_ASSERT(inAxis >= 0 && inAxis < rank && seen[inAxis] == 0);
+            seen[inAxis] = 1; // marked outside the assert: keep checks free of side effects
+            output_dim[outAxis] = input_dim[inAxis];
+        }
+
+        return {{output_dim}};
     }
 
     std::string TransposeObj::toString() const
diff --git a/src/operators/unary.cc b/src/operators/unary.cc
index 3daad36..8cf63f4 100644
--- a/src/operators/unary.cc
+++ b/src/operators/unary.cc
@@ -39,7 +39,8 @@ namespace infini
         // TODO：返回经过 clip 操作后的 shape
         // REF: https://onnx.ai/onnx/operators/onnx__Clip.html#clip-13
         // =================================== 作业 ===================================
-        return std::nullopt;
+        const auto X = inputs[0];
+        return {{X->getDims()}};
     }
 
     std::string ClipObj::toString() const
@@ -66,7 +67,8 @@ namespace infini
         // REF_FILE: src/core/operator.cc
         // REF: https://onnx.ai/onnx/operators/onnx__Cast.html#cast-21
         // =================================== 作业 ===================================
-        return {};
+        (void)inputs;
+        return {getOutputDataType()};
     }
 
     optional<vector<Shape>> CastObj::inferShape(const TensorVec &inputs)
@@ -75,7 +77,8 @@ namespace infini
         // TODO：返回经过 cast 操作后的 shape
         // REF: https://onnx.ai/onnx/operators/onnx__Cast.html#cast-21
         // =================================== 作业 ===================================
-        return std::nullopt;
+        const auto X = inputs[0];
+        return {{X->getDims()}};
     }
 
     std::string CastObj::toString() const
diff --git a/src/utils/operator_utils.cc b/src/utils/operator_utils.cc
index edbd2c8..bb0dd8a 100644
--- a/src/utils/operator_utils.cc
+++ b/src/utils/operator_utils.cc
@@ -9,8 +9,32 @@ Shape infer_broadcast(const Shape &A, const Shape &B) {
     // TODO：对 A 和 B 进行双向广播，返回广播后的形状。
     // REF: https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md
     // =================================== 作业 ===================================
-    
-    return {};
+
+    const size_t rankA = A.size();
+    const size_t rankB = B.size();
+    const size_t rank = std::max(rankA, rankB);
+
+    Shape out(rank, 1);
+
+    // Align dimensions from the trailing axis (like NumPy/ONNX broadcasting).
+    for (size_t i = 0; i < rank; ++i) {
+        const size_t aIdx = (i < rank - rankA) ? (size_t)-1 : (i - (rank - rankA));
+        const size_t bIdx = (i < rank - rankB) ? (size_t)-1 : (i - (rank - rankB));
+
+        const int aDim = (aIdx == (size_t)-1) ? 1 : A[aIdx];
+        const int bDim = (bIdx == (size_t)-1) ? 1 : B[bIdx];
+
+        if (aDim == bDim)
+            out[i] = aDim;
+        else if (aDim == 1)
+            out[i] = bDim;
+        else if (bDim == 1)
+            out[i] = aDim;
+        else
+            IT_ASSERT(false, "Broadcast failed: incompatible dimensions");
+    }
+
+    return out;
 }
 
 int get_real_axis(const int &axis, const int &rank) {