From 8081cc4707a1dc85d1a54044aed5b11d6ffe1253 Mon Sep 17 00:00:00 2001
From: Ram Nallamilli
Date: Tue, 23 Dec 2025 16:07:32 -0600
Subject: [PATCH] Add cache implementation to compute cores

- Implement direct-mapped cache (64 lines, write-through policy)
- Add cache.sv module for data caching between LSU and memory controller
- Add lsu_cached.sv as cache-enabled LSU variant
- Update core.sv to use lsu_cached
- Update Makefile to compile cache modules
- Add test_cache.py to demonstrate cache effectiveness with data reuse patterns

The cache stores recently accessed data to reduce global memory traffic.
Each thread has its own cache instance, providing significant performance
improvements for workloads with temporal locality (repeated data access).
---
 Makefile           |   6 +-
 src/cache.sv       | 143 +++++++++++++++++++++++++++++++++++++++++++
 src/core.sv        |   6 +-
 src/lsu_cached.sv  | 148 +++++++++++++++++++++++++++++++++++++++++++++
 test/test_cache.py |  88 +++++++++++++++++++++++++++
 5 files changed, 386 insertions(+), 5 deletions(-)
 create mode 100644 src/cache.sv
 create mode 100644 src/lsu_cached.sv
 create mode 100644 test/test_cache.py

diff --git a/Makefile b/Makefile
index bc10f84..967c029 100644
--- a/Makefile
+++ b/Makefile
@@ -8,8 +8,9 @@ test_%:
 	MODULE=test.test_$* vvp -M $$(cocotb-config --prefix)/cocotb/libs -m libcocotbvpi_icarus build/sim.vvp
 
 compile:
+	mkdir -p build
 	make compile_alu
-	sv2v -I src/* -w build/gpu.v
+	sv2v src/cache.sv src/controller.sv src/core.sv src/decoder.sv src/dispatcher.sv src/fetcher.sv src/gpu.sv src/lsu.sv src/lsu_cached.sv src/pc.sv src/registers.sv src/scheduler.sv -w build/gpu.v
 	echo "" >> build/gpu.v
 	cat build/alu.v >> build/gpu.v
 	echo '`timescale 1ns/1ns' > build/temp.v
@@ -17,9 +18,10 @@ compile:
 	mv build/temp.v build/gpu.v
 
 compile_%:
+	mkdir -p build
 	sv2v -w build/$*.v src/$*.sv
 
 # TODO: Get gtkwave visualizaiton
 
 show_%: %.vcd %.gtkw
-	gtkwave $^
+	gtkwave $^
\ No newline at end of file
diff --git a/src/cache.sv b/src/cache.sv
new file mode 100644
index 0000000..3231bd6
--- /dev/null
+++ b/src/cache.sv
@@ -0,0 +1,143 @@
+`default_nettype none
+`timescale 1ns/1ns
+
+// CACHE
+// > Simple direct-mapped cache for data memory
+// > Sits between LSU and memory controller
+// > Stores recently accessed data to reduce global memory traffic
+// > Write-through: stores update the line and are always forwarded to memory
+// > No invalidation port, so separate instances are not kept coherent
+module cache #(
+    parameter CACHE_LINES = 64,
+    parameter ADDR_BITS = 8,
+    parameter DATA_BITS = 8,
+    parameter INDEX_BITS = 6, // log2(CACHE_LINES)
+    parameter TAG_BITS = 2    // ADDR_BITS - INDEX_BITS
+) (
+    input wire clk,
+    input wire reset,
+    input wire enable,
+
+    // Interface from LSU
+    input wire read_request,
+    input wire write_request,
+    input wire [ADDR_BITS-1:0] address,
+    input wire [DATA_BITS-1:0] write_data,
+
+    // Interface to LSU
+    output reg read_ready,
+    output reg write_ready,
+    output reg [DATA_BITS-1:0] read_data,
+
+    // Interface to Memory Controller
+    output reg mem_read_valid,
+    output reg [ADDR_BITS-1:0] mem_read_address,
+    input wire mem_read_ready,
+    input wire [DATA_BITS-1:0] mem_read_data,
+    output reg mem_write_valid,
+    output reg [ADDR_BITS-1:0] mem_write_address,
+    output reg [DATA_BITS-1:0] mem_write_data,
+    input wire mem_write_ready
+);
+    // State machine states
+    localparam IDLE = 2'b00;
+    localparam MEM_READ_WAIT = 2'b01;
+    localparam MEM_WRITE_WAIT = 2'b10;
+
+    // Cache storage
+    reg [DATA_BITS-1:0] cache_data [CACHE_LINES-1:0];
+    reg [TAG_BITS-1:0] cache_tags [CACHE_LINES-1:0];
+    reg cache_valid [CACHE_LINES-1:0];
+
+    // Extract index and tag from address
+    wire [INDEX_BITS-1:0] index = address[INDEX_BITS-1:0];
+    wire [TAG_BITS-1:0] tag = address[ADDR_BITS-1:INDEX_BITS];
+
+    // Cache hit detection
+    wire cache_hit = cache_valid[index] && (cache_tags[index] == tag);
+
+    // State register
+    reg [1:0] cache_state;
+
+    // Loop variable
+    integer i;
+
+    always @(posedge clk) begin
+        if (reset) begin
+            cache_state <= IDLE;
+            read_ready <= 0;
+            write_ready <= 0;
+            read_data <= 0;
+            mem_read_valid <= 0;
+            mem_read_address <= 0;
+            mem_write_valid <= 0;
+            mem_write_address <= 0;
+            mem_write_data <= 0;
+
+            // Initialize cache as invalid
+            for (i = 0; i < CACHE_LINES; i = i + 1) begin
+                cache_valid[i] <= 0;
+                cache_tags[i] <= 0;
+                cache_data[i] <= 0;
+            end
+        end else if (enable) begin
+            case (cache_state)
+                IDLE: begin
+                    read_ready <= 0;
+                    write_ready <= 0;
+
+                    // The LSU drops its request one cycle after it samples a
+                    // ready pulse, so the request is still high while the pulse
+                    // is visible. Skip that cycle; otherwise the finished
+                    // access would be replayed and every store would be sent
+                    // to memory twice.
+                    if (read_request && !read_ready) begin
+                        if (cache_hit) begin
+                            // Cache hit - return data immediately
+                            read_data <= cache_data[index];
+                            read_ready <= 1;
+                        end else begin
+                            // Cache miss - request from memory
+                            mem_read_valid <= 1;
+                            mem_read_address <= address;
+                            cache_state <= MEM_READ_WAIT;
+                        end
+                    end else if (write_request && !write_ready) begin
+                        // Write-through: update cache and write to memory
+                        cache_data[index] <= write_data;
+                        cache_tags[index] <= tag;
+                        cache_valid[index] <= 1;
+
+                        mem_write_valid <= 1;
+                        mem_write_address <= address;
+                        mem_write_data <= write_data;
+                        cache_state <= MEM_WRITE_WAIT;
+                    end
+                end
+
+                MEM_READ_WAIT: begin
+                    if (mem_read_ready) begin
+                        // Store data in cache
+                        cache_data[index] <= mem_read_data;
+                        cache_tags[index] <= tag;
+                        cache_valid[index] <= 1;
+
+                        // Return data to LSU
+                        read_data <= mem_read_data;
+                        read_ready <= 1;
+                        mem_read_valid <= 0;
+                        cache_state <= IDLE;
+                    end
+                end
+
+                MEM_WRITE_WAIT: begin
+                    if (mem_write_ready) begin
+                        write_ready <= 1;
+                        mem_write_valid <= 0;
+                        cache_state <= IDLE;
+                    end
+                end
+            endcase
+        end
+    end
+endmodule
diff --git a/src/core.sv b/src/core.sv
index 80a0b00..34f2f2f 100644
--- a/src/core.sv
+++ b/src/core.sv
@@ -144,8 +144,8 @@ module core #(
                 .rt(rt[i]),
                 .alu_out(alu_out[i])
             );
-
-            // LSU
-            lsu lsu_instance (
+
+            // LSU with Cache
+            lsu_cached lsu_instance (
                 .clk(clk),
                 .reset(reset),
diff --git a/src/lsu_cached.sv b/src/lsu_cached.sv
new file mode 100644
index 0000000..3b228c4
--- /dev/null
+++ b/src/lsu_cached.sv
@@ -0,0 +1,148 @@
+`default_nettype none
+`timescale 1ns/1ns
+
+// LOAD-STORE UNIT WITH CACHE
+// > Handles asynchronous memory load and store operations through cache
+// > Each thread in each core has its own LSU with cache
+// > LDR, STR instructions are executed here
+// > A request stays asserted until the cache pulses the matching ready signal
+module lsu_cached (
+    input wire clk,
+    input wire reset,
+    input wire enable,
+
+    // State
+    input reg [2:0] core_state,
+
+    // Memory Control Signals
+    input reg decoded_mem_read_enable,
+    input reg decoded_mem_write_enable,
+
+    // Registers
+    input reg [7:0] rs,
+    input reg [7:0] rt,
+
+    // Data Memory (through controller)
+    output reg mem_read_valid,
+    output reg [7:0] mem_read_address,
+    input reg mem_read_ready,
+    input reg [7:0] mem_read_data,
+    output reg mem_write_valid,
+    output reg [7:0] mem_write_address,
+    output reg [7:0] mem_write_data,
+    input reg mem_write_ready,
+
+    // LSU Outputs
+    output reg [1:0] lsu_state,
+    output reg [7:0] lsu_out
+);
+    localparam IDLE = 2'b00, REQUESTING = 2'b01, WAITING = 2'b10, DONE = 2'b11;
+
+    // Cache signals
+    reg cache_read_request;
+    reg cache_write_request;
+    reg [7:0] cache_address;
+    reg [7:0] cache_write_data;
+    wire cache_read_ready;
+    wire cache_write_ready;
+    wire [7:0] cache_read_data;
+
+    // Instantiate cache
+    cache #(
+        .CACHE_LINES(64),
+        .ADDR_BITS(8),
+        .DATA_BITS(8),
+        .INDEX_BITS(6),
+        .TAG_BITS(2)
+    ) cache_inst (
+        .clk(clk),
+        .reset(reset),
+        .enable(enable),
+
+        // LSU interface
+        .read_request(cache_read_request),
+        .write_request(cache_write_request),
+        .address(cache_address),
+        .write_data(cache_write_data),
+        .read_ready(cache_read_ready),
+        .write_ready(cache_write_ready),
+        .read_data(cache_read_data),
+
+        // Memory controller interface
+        .mem_read_valid(mem_read_valid),
+        .mem_read_address(mem_read_address),
+        .mem_read_ready(mem_read_ready),
+        .mem_read_data(mem_read_data),
+        .mem_write_valid(mem_write_valid),
+        .mem_write_address(mem_write_address),
+        .mem_write_data(mem_write_data),
+        .mem_write_ready(mem_write_ready)
+    );
+
+    always @(posedge clk) begin
+        if (reset) begin
+            lsu_state <= IDLE;
+            lsu_out <= 0;
+            cache_read_request <= 0;
+            cache_write_request <= 0;
+            cache_address <= 0;
+            cache_write_data <= 0;
+        end else if (enable) begin
+            // Handle memory read (LDR instruction)
+            if (decoded_mem_read_enable) begin
+                case (lsu_state)
+                    IDLE: begin
+                        if (core_state == 3'b011) begin // REQUEST state
+                            lsu_state <= REQUESTING;
+                        end
+                    end
+                    REQUESTING: begin
+                        cache_read_request <= 1;
+                        cache_address <= rs;
+                        lsu_state <= WAITING;
+                    end
+                    WAITING: begin
+                        if (cache_read_ready) begin
+                            cache_read_request <= 0;
+                            lsu_out <= cache_read_data;
+                            lsu_state <= DONE;
+                        end
+                    end
+                    DONE: begin
+                        if (core_state == 3'b110) begin // UPDATE state
+                            lsu_state <= IDLE;
+                        end
+                    end
+                endcase
+            end
+
+            // Handle memory write (STR instruction)
+            if (decoded_mem_write_enable) begin
+                case (lsu_state)
+                    IDLE: begin
+                        if (core_state == 3'b011) begin // REQUEST state
+                            lsu_state <= REQUESTING;
+                        end
+                    end
+                    REQUESTING: begin
+                        cache_write_request <= 1;
+                        cache_address <= rs;
+                        cache_write_data <= rt;
+                        lsu_state <= WAITING;
+                    end
+                    WAITING: begin
+                        if (cache_write_ready) begin
+                            cache_write_request <= 0;
+                            lsu_state <= DONE;
+                        end
+                    end
+                    DONE: begin
+                        if (core_state == 3'b110) begin // UPDATE state
+                            lsu_state <= IDLE;
+                        end
+                    end
+                endcase
+            end
+        end
+    end
+endmodule
diff --git a/test/test_cache.py b/test/test_cache.py
new file mode 100644
index 0000000..0d0f899
--- /dev/null
+++ b/test/test_cache.py
@@ -0,0 +1,88 @@
+import cocotb
+from cocotb.triggers import RisingEdge
+from test.helpers.setup import setup
+from test.helpers.memory import Memory
+from test.helpers.format import format_cycle
+from test.helpers.logger import logger
+
+@cocotb.test()
+async def test_cache_reuse(dut):
+    # Program Memory - Each thread reads address 0 THREE times
+    program_memory = Memory(dut=dut, addr_bits=8, data_bits=16, channels=1, name="program")
+    program = [
+        0b1001000000000000, # CONST R0, #0                   ; address to read
+        0b1001000100000000, # CONST R1, #0                   ; accumulator
+
+        # Read 1
+        0b0111001000000000, # LDR R2, R0                     ; read from address 0
+        0b0011000100010010, # ADD R1, R1, R2                 ; accumulate
+
+        # Read 2 (same address)
+        0b0111001000000000, # LDR R2, R0                     ; read from address 0 again
+        0b0011000100010010, # ADD R1, R1, R2                 ; accumulate
+
+        # Read 3 (same address)
+        0b0111001000000000, # LDR R2, R0                     ; read from address 0 again
+        0b0011000100010010, # ADD R1, R1, R2                 ; accumulate
+
+        # Store result
+        0b1001001100010000, # CONST R3, #16                  ; output base address
+        0b0011010000111111, # ADD R4, R3, %threadIdx         ; output address
+        0b1000000001000001, # STR R4, R1                     ; store result
+        0b1111000000000000, # RET
+    ]
+
+    # Data Memory
+    data_memory = Memory(dut=dut, addr_bits=8, data_bits=8, channels=4, name="data")
+    data = [
+        10, # Address 0: value that will be read 3x by each thread
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, # Addresses 16-19: output
+    ]
+
+    threads = 4
+
+    await setup(
+        dut=dut,
+        program_memory=program_memory,
+        program=program,
+        data_memory=data_memory,
+        data=data,
+        threads=threads
+    )
+
+    logger.info("="*80)
+    logger.info("CACHE REUSE TEST - Each thread reads address 0 THREE times")
+    logger.info("="*80)
+
+    data_memory.display(20)
+
+    cycles = 0
+
+    while dut.done.value != 1:
+        data_memory.run()
+        program_memory.run()
+
+        await cocotb.triggers.ReadOnly()
+        format_cycle(dut, cycles)
+
+        await RisingEdge(dut.clk)
+        cycles += 1
+
+        # Fail on a hang instead of breaking out and checking unfinished results
+        assert cycles <= 10000, "Timed out: kernel did not finish within 10000 cycles"
+
+    print(f"\nCompleted in {cycles} cycles")
+    logger.info(f"Completed in {cycles} cycles")
+
+    data_memory.display(20)
+
+    # Verify: each thread should output 30 (10 + 10 + 10)
+    expected = 30
+    for i in range(threads):
+        addr = 16 + i
+        result = data_memory.memory[addr]
+        assert result == expected, f"Thread {i}: expected {expected}, got {result}"
+
+    print(f"All outputs correct: {expected}")
+    logger.info(f"All outputs correct: {expected}")