Reviewed revision of the data-cache patch (Makefile, src/cache.sv, src/core.sv,
src/lsu_cached.sv, test/test_cache.py).

Changes relative to the submitted patch:
  * src/cache.sv: a request that is still asserted during its own ready pulse
    is no longer started again (with lsu_cached as requester this issued every
    store to memory twice); refills use the latched miss address; the unused
    state encoding falls back to IDLE.
  * src/core.sv: the core instantiates lsu_cached, so the cache is actually in
    the data path; the whitespace-only change to the preceding blank line is
    dropped. NOTE(review): only .clk/.reset of the instance are visible in the
    hunk - confirm the remaining connections match the lsu_cached port names.
  * src/lsu_cached.sv: memory-side outputs driven by cache_inst are nets.
  * test/test_cache.py: a hung kernel fails with an explicit timeout message.
  * Makefile and the new files keep their trailing newline.

The submitted patch had lost its line structure. Line breaks and blank lines
were reconstructed from the hunk line counts; indentation inside the hunks is
a best guess and must be checked against the tree before applying (Makefile
recipes are tab-indented). The post-image blob ids on the "index" lines
predate these changes.

diff --git a/Makefile b/Makefile
index bc10f84..967c029 100644
--- a/Makefile
+++ b/Makefile
@@ -8,8 +8,9 @@ test_%:
 	MODULE=test.test_$* vvp -M $$(cocotb-config --prefix)/cocotb/libs -m libcocotbvpi_icarus build/sim.vvp
 
 compile:
+	mkdir -p build
 	make compile_alu
-	sv2v -I src/* -w build/gpu.v
+	sv2v src/cache.sv src/controller.sv src/core.sv src/decoder.sv src/dispatcher.sv src/fetcher.sv src/gpu.sv src/lsu.sv src/lsu_cached.sv src/pc.sv src/registers.sv src/scheduler.sv -w build/gpu.v
 	echo "" >> build/gpu.v
 	cat build/alu.v >> build/gpu.v
 	echo '`timescale 1ns/1ns' > build/temp.v
@@ -17,9 +18,10 @@ compile:
 	mv build/temp.v build/gpu.v
 
 compile_%:
+	mkdir -p build
 	sv2v -w build/$*.v src/$*.sv
 
 # TODO: Get gtkwave visualizaiton
 
 show_%: %.vcd %.gtkw
 	gtkwave $^
diff --git a/src/cache.sv b/src/cache.sv
new file mode 100644
index 0000000..3231bd6
--- /dev/null
+++ b/src/cache.sv
@@ -0,0 +1,151 @@
+`default_nettype none
+`timescale 1ns/1ns
+
+// CACHE
+// > Simple direct-mapped cache for data memory
+// > Sits between LSU and memory controller
+// > Stores recently accessed data to reduce global memory traffic
+// > Write-through: a store updates the cache line and is forwarded to memory
+// > read_ready / write_ready pulse for one cycle per completed request
+// > A request still asserted during its own ready pulse is not restarted
+module cache #(
+    parameter CACHE_LINES = 64,
+    parameter ADDR_BITS = 8,
+    parameter DATA_BITS = 8,
+    parameter INDEX_BITS = 6,  // log2(CACHE_LINES)
+    parameter TAG_BITS = 2     // ADDR_BITS - INDEX_BITS
+) (
+    input wire clk,
+    input wire reset,
+    input wire enable,
+
+    // Interface from LSU
+    input wire read_request,
+    input wire write_request,
+    input wire [ADDR_BITS-1:0] address,
+    input wire [DATA_BITS-1:0] write_data,
+
+    // Interface to LSU
+    output reg read_ready,
+    output reg write_ready,
+    output reg [DATA_BITS-1:0] read_data,
+
+    // Interface to Memory Controller
+    output reg mem_read_valid,
+    output reg [ADDR_BITS-1:0] mem_read_address,
+    input wire mem_read_ready,
+    input wire [DATA_BITS-1:0] mem_read_data,
+    output reg mem_write_valid,
+    output reg [ADDR_BITS-1:0] mem_write_address,
+    output reg [DATA_BITS-1:0] mem_write_data,
+    input wire mem_write_ready
+);
+    // State machine states
+    localparam IDLE = 2'b00;
+    localparam MEM_READ_WAIT = 2'b01;
+    localparam MEM_WRITE_WAIT = 2'b10;
+
+    // Cache storage
+    reg [DATA_BITS-1:0] cache_data [CACHE_LINES-1:0];
+    reg [TAG_BITS-1:0] cache_tags [CACHE_LINES-1:0];
+    reg cache_valid [CACHE_LINES-1:0];
+
+    // Extract index and tag from address
+    wire [INDEX_BITS-1:0] index = address[INDEX_BITS-1:0];
+    wire [TAG_BITS-1:0] tag = address[ADDR_BITS-1:INDEX_BITS];
+
+    // Index and tag of the outstanding miss, taken from the latched miss
+    // address so the refill does not rely on `address` being held stable
+    wire [INDEX_BITS-1:0] fill_index = mem_read_address[INDEX_BITS-1:0];
+    wire [TAG_BITS-1:0] fill_tag = mem_read_address[ADDR_BITS-1:INDEX_BITS];
+
+    // Cache hit detection
+    wire cache_hit = cache_valid[index] && (cache_tags[index] == tag);
+
+    // State register
+    reg [1:0] cache_state;
+
+    // Loop variable
+    integer i;
+
+    always @(posedge clk) begin
+        if (reset) begin
+            cache_state <= IDLE;
+            read_ready <= 0;
+            write_ready <= 0;
+            read_data <= 0;
+            mem_read_valid <= 0;
+            mem_read_address <= 0;
+            mem_write_valid <= 0;
+            mem_write_address <= 0;
+            mem_write_data <= 0;
+
+            // Initialize cache as invalid
+            for (i = 0; i < CACHE_LINES; i = i + 1) begin
+                cache_valid[i] <= 0;
+                cache_tags[i] <= 0;
+                cache_data[i] <= 0;
+            end
+        end else if (enable) begin
+            case (cache_state)
+                IDLE: begin
+                    // Ready flags are single-cycle pulses
+                    read_ready <= 0;
+                    write_ready <= 0;
+
+                    // A request answered on the previous cycle may still be
+                    // asserted here (lsu_cached drops it one cycle after ready);
+                    // the !*_ready guards keep it from being started a second time
+                    if (read_request && !read_ready) begin
+                        if (cache_hit) begin
+                            // Cache hit - return data immediately
+                            read_data <= cache_data[index];
+                            read_ready <= 1;
+                        end else begin
+                            // Cache miss - request from memory
+                            mem_read_valid <= 1;
+                            mem_read_address <= address;
+                            cache_state <= MEM_READ_WAIT;
+                        end
+                    end else if (write_request && !write_ready) begin
+                        // Write-through: update cache and write to memory
+                        cache_data[index] <= write_data;
+                        cache_tags[index] <= tag;
+                        cache_valid[index] <= 1;
+
+                        mem_write_valid <= 1;
+                        mem_write_address <= address;
+                        mem_write_data <= write_data;
+                        cache_state <= MEM_WRITE_WAIT;
+                    end
+                end
+
+                MEM_READ_WAIT: begin
+                    if (mem_read_ready) begin
+                        // Store data in cache
+                        cache_data[fill_index] <= mem_read_data;
+                        cache_tags[fill_index] <= fill_tag;
+                        cache_valid[fill_index] <= 1;
+
+                        // Return data to LSU
+                        read_data <= mem_read_data;
+                        read_ready <= 1;
+                        mem_read_valid <= 0;
+                        cache_state <= IDLE;
+                    end
+                end
+
+                MEM_WRITE_WAIT: begin
+                    if (mem_write_ready) begin
+                        write_ready <= 1;
+                        mem_write_valid <= 0;
+                        cache_state <= IDLE;
+                    end
+                end
+
+                // 2'b11 is unused; recover to IDLE if it is ever reached
+                default: cache_state <= IDLE;
+            endcase
+        end
+    end
+endmodule
diff --git a/src/core.sv b/src/core.sv
index 80a0b00..34f2f2f 100644
--- a/src/core.sv
+++ b/src/core.sv
@@ -144,8 +144,8 @@ module core #(
                 .rt(rt[i]),
                 .alu_out(alu_out[i])
             );
 
-            // LSU
-            lsu lsu_instance (
+            // LSU with Cache
+            lsu_cached lsu_instance (
                 .clk(clk),
                 .reset(reset),
diff --git a/src/lsu_cached.sv b/src/lsu_cached.sv
new file mode 100644
index 0000000..3b228c4
--- /dev/null
+++ b/src/lsu_cached.sv
@@ -0,0 +1,150 @@
+`default_nettype none
+`timescale 1ns/1ns
+
+// LOAD-STORE UNIT WITH CACHE
+// > Handles asynchronous memory load and store operations through cache
+// > Each thread in each core has its own LSU with cache
+// > LDR, STR instructions are executed here
+module lsu_cached (
+    input wire clk,
+    input wire reset,
+    input wire enable,
+
+    // State
+    input reg [2:0] core_state,
+
+    // Memory Control Signals
+    input reg decoded_mem_read_enable,
+    input reg decoded_mem_write_enable,
+
+    // Registers
+    input reg [7:0] rs,
+    input reg [7:0] rt,
+
+    // Data Memory (through controller)
+    // > Outputs are driven by cache_inst, so they are declared as nets
+    output wire mem_read_valid,
+    output wire [7:0] mem_read_address,
+    input reg mem_read_ready,
+    input reg [7:0] mem_read_data,
+    output wire mem_write_valid,
+    output wire [7:0] mem_write_address,
+    output wire [7:0] mem_write_data,
+    input reg mem_write_ready,
+
+    // LSU Outputs
+    output reg [1:0] lsu_state,
+    output reg [7:0] lsu_out
+);
+    localparam IDLE = 2'b00, REQUESTING = 2'b01, WAITING = 2'b10, DONE = 2'b11;
+
+    // Cache signals
+    reg cache_read_request;
+    reg cache_write_request;
+    reg [7:0] cache_address;
+    reg [7:0] cache_write_data;
+    wire cache_read_ready;
+    wire cache_write_ready;
+    wire [7:0] cache_read_data;
+
+    // Instantiate cache
+    cache #(
+        .CACHE_LINES(64),
+        .ADDR_BITS(8),
+        .DATA_BITS(8),
+        .INDEX_BITS(6),
+        .TAG_BITS(2)
+    ) cache_inst (
+        .clk(clk),
+        .reset(reset),
+        .enable(enable),
+
+        // LSU interface
+        .read_request(cache_read_request),
+        .write_request(cache_write_request),
+        .address(cache_address),
+        .write_data(cache_write_data),
+        .read_ready(cache_read_ready),
+        .write_ready(cache_write_ready),
+        .read_data(cache_read_data),
+
+        // Memory controller interface
+        .mem_read_valid(mem_read_valid),
+        .mem_read_address(mem_read_address),
+        .mem_read_ready(mem_read_ready),
+        .mem_read_data(mem_read_data),
+        .mem_write_valid(mem_write_valid),
+        .mem_write_address(mem_write_address),
+        .mem_write_data(mem_write_data),
+        .mem_write_ready(mem_write_ready)
+    );
+
+    always @(posedge clk) begin
+        if (reset) begin
+            lsu_state <= IDLE;
+            lsu_out <= 0;
+            cache_read_request <= 0;
+            cache_write_request <= 0;
+            cache_address <= 0;
+            cache_write_data <= 0;
+        end else if (enable) begin
+            // Handle memory read (LDR instruction)
+            if (decoded_mem_read_enable) begin
+                case (lsu_state)
+                    IDLE: begin
+                        if (core_state == 3'b011) begin // REQUEST state
+                            lsu_state <= REQUESTING;
+                        end
+                    end
+                    REQUESTING: begin
+                        cache_read_request <= 1;
+                        cache_address <= rs;
+                        lsu_state <= WAITING;
+                    end
+                    WAITING: begin
+                        // Drop the request on the cache's ready pulse
+                        if (cache_read_ready) begin
+                            cache_read_request <= 0;
+                            lsu_out <= cache_read_data;
+                            lsu_state <= DONE;
+                        end
+                    end
+                    DONE: begin
+                        if (core_state == 3'b110) begin // UPDATE state
+                            lsu_state <= IDLE;
+                        end
+                    end
+                endcase
+            end
+
+            // Handle memory write (STR instruction)
+            if (decoded_mem_write_enable) begin
+                case (lsu_state)
+                    IDLE: begin
+                        if (core_state == 3'b011) begin // REQUEST state
+                            lsu_state <= REQUESTING;
+                        end
+                    end
+                    REQUESTING: begin
+                        cache_write_request <= 1;
+                        cache_address <= rs;
+                        cache_write_data <= rt;
+                        lsu_state <= WAITING;
+                    end
+                    WAITING: begin
+                        // Drop the request on the cache's ready pulse
+                        if (cache_write_ready) begin
+                            cache_write_request <= 0;
+                            lsu_state <= DONE;
+                        end
+                    end
+                    DONE: begin
+                        if (core_state == 3'b110) begin // UPDATE state
+                            lsu_state <= IDLE;
+                        end
+                    end
+                endcase
+            end
+        end
+    end
+endmodule
diff --git a/test/test_cache.py b/test/test_cache.py
new file mode 100644
index 0000000..0d0f899
--- /dev/null
+++ b/test/test_cache.py
@@ -0,0 +1,99 @@
+import cocotb
+from cocotb.triggers import ReadOnly, RisingEdge
+from test.helpers.setup import setup
+from test.helpers.memory import Memory
+from test.helpers.format import format_cycle
+from test.helpers.logger import logger
+
+# Cycle budget after which a kernel that never raises `done` fails the test
+MAX_CYCLES = 10000
+
+
+@cocotb.test()
+async def test_cache_reuse(dut):
+    """Check a kernel in which every thread loads address 0 three times.
+
+    Each thread sums the three loaded values and stores the total at address
+    16 + threadIdx; every total must equal 30.
+    """
+    # Program Memory - Each thread reads address 0 THREE times
+    program_memory = Memory(dut=dut, addr_bits=8, data_bits=16, channels=1, name="program")
+    program = [
+        0b1001000000000000, # CONST R0, #0           ; address to read
+        0b1001000100000000, # CONST R1, #0           ; accumulator
+
+        # Read 1
+        0b0111001000000000, # LDR R2, R0             ; read from address 0
+        0b0011000100010010, # ADD R1, R1, R2         ; accumulate
+
+        # Read 2 (same address)
+        0b0111001000000000, # LDR R2, R0             ; read from address 0 again
+        0b0011000100010010, # ADD R1, R1, R2         ; accumulate
+
+        # Read 3 (same address)
+        0b0111001000000000, # LDR R2, R0             ; read from address 0 again
+        0b0011000100010010, # ADD R1, R1, R2         ; accumulate
+
+        # Store result
+        0b1001001100010000, # CONST R3, #16          ; output base address
+        0b0011010000111111, # ADD R4, R3, %threadIdx ; output address
+        0b1000000001000001, # STR R4, R1             ; store result
+        0b1111000000000000, # RET
+    ]
+
+    # Data Memory
+    data_memory = Memory(dut=dut, addr_bits=8, data_bits=8, channels=4, name="data")
+    data = [
+        10,  # Address 0: value that will be read 3x by each thread
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0,  # Addresses 16-19: output
+    ]
+
+    threads = 4
+
+    await setup(
+        dut=dut,
+        program_memory=program_memory,
+        program=program,
+        data_memory=data_memory,
+        data=data,
+        threads=threads
+    )
+
+    logger.info("="*80)
+    logger.info("CACHE REUSE TEST - Each thread reads address 0 THREE times")
+    logger.info("="*80)
+
+    data_memory.display(20)
+
+    cycles = 0
+
+    while dut.done.value != 1:
+        data_memory.run()
+        program_memory.run()
+
+        await ReadOnly()
+        format_cycle(dut, cycles)
+
+        await RisingEdge(dut.clk)
+        cycles += 1
+
+        # Fail with an explicit message instead of falling through to the output checks
+        assert cycles <= MAX_CYCLES, (
+            f"Timed out: kernel not done after {MAX_CYCLES} cycles"
+        )
+
+    print(f"\nCompleted in {cycles} cycles")
+    logger.info(f"Completed in {cycles} cycles")
+
+    data_memory.display(20)
+
+    # Verify: each thread should output 30 (10 + 10 + 10)
+    expected = 30
+    for i in range(threads):
+        addr = 16 + i
+        result = data_memory.memory[addr]
+        assert result == expected, f"Thread {i}: expected {expected}, got {result}"
+
+    print(f"All outputs correct: {expected}")
+    logger.info(f"All outputs correct: {expected}")