diff --git a/.gitignore b/.gitignore
index 8586c55..047111b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,5 @@
 test/logs/*
 gds/**/*.gltf
 .DS_Store
-results.xml
\ No newline at end of file
+results.xml
+docs/*.md
\ No newline at end of file
diff --git a/src/controller.sv b/src/controller.sv
index eeedef2..158e316 100644
--- a/src/controller.sv
+++ b/src/controller.sv
@@ -64,14 +64,18 @@ module controller #(
 
-            channel_serving_consumer = 0;
+            channel_serving_consumer <= 0;
         end else begin
+            // Local variable to handle arbitration updates within the same cycle
+            reg [NUM_CONSUMERS-1:0] next_channel_serving_consumer;
+            next_channel_serving_consumer = channel_serving_consumer;
+
             // For each channel, we handle processing concurrently
             for (int i = 0; i < NUM_CHANNELS; i = i + 1) begin
                 case (controller_state[i])
                     IDLE: begin
                         // While this channel is idle, cycle through consumers looking for one with a pending request
                         for (int j = 0; j < NUM_CONSUMERS; j = j + 1) begin
-                            if (consumer_read_valid[j] && !channel_serving_consumer[j]) begin
-                                channel_serving_consumer[j] = 1;
+                            if (consumer_read_valid[j] && !next_channel_serving_consumer[j]) begin
+                                next_channel_serving_consumer[j] = 1;
                                 current_consumer[i] <= j;
 
                                 mem_read_valid[i] <= 1;
@@ -80,8 +84,8 @@ module controller #(
 
                                 // Once we find a pending request, pick it up with this channel and stop looking for requests
                                 break;
-                            end else if (consumer_write_valid[j] && !channel_serving_consumer[j]) begin
-                                channel_serving_consumer[j] = 1;
+                            end else if (consumer_write_valid[j] && !next_channel_serving_consumer[j]) begin
+                                next_channel_serving_consumer[j] = 1;
                                 current_consumer[i] <= j;
 
                                 mem_write_valid[i] <= 1;
@@ -114,20 +118,23 @@ module controller #(
                     // Wait until consumer acknowledges it received response, then reset
                     READ_RELAYING: begin
                         if (!consumer_read_valid[current_consumer[i]]) begin
-                            channel_serving_consumer[current_consumer[i]] = 0;
+                            next_channel_serving_consumer[current_consumer[i]] = 0;
                             consumer_read_ready[current_consumer[i]] <= 0;
                             controller_state[i] <= IDLE;
                         end
                     end
                     WRITE_RELAYING: begin
                         if (!consumer_write_valid[current_consumer[i]]) begin
-                            channel_serving_consumer[current_consumer[i]] = 0;
+                            next_channel_serving_consumer[current_consumer[i]] = 0;
                             consumer_write_ready[current_consumer[i]] <= 0;
                             controller_state[i] <= IDLE;
                         end
                     end
                 endcase
             end
+
+            // Update the state register
+            channel_serving_consumer <= next_channel_serving_consumer;
         end
     end
 endmodule
diff --git a/src/core.sv b/src/core.sv
index 80a0b00..ecdeb1c 100644
--- a/src/core.sv
+++ b/src/core.sv
@@ -24,52 +24,52 @@ module core #(
     input wire [$clog2(THREADS_PER_BLOCK):0] thread_count,
 
     // Program Memory
-    output reg program_mem_read_valid,
-    output reg [PROGRAM_MEM_ADDR_BITS-1:0] program_mem_read_address,
+    output wire program_mem_read_valid,
+    output wire [PROGRAM_MEM_ADDR_BITS-1:0] program_mem_read_address,
     input reg program_mem_read_ready,
     input reg [PROGRAM_MEM_DATA_BITS-1:0] program_mem_read_data,
 
     // Data Memory
-    output reg [THREADS_PER_BLOCK-1:0] data_mem_read_valid,
-    output reg [DATA_MEM_ADDR_BITS-1:0] data_mem_read_address [THREADS_PER_BLOCK-1:0],
+    output wire [THREADS_PER_BLOCK-1:0] data_mem_read_valid,
+    output wire [DATA_MEM_ADDR_BITS-1:0] data_mem_read_address [THREADS_PER_BLOCK-1:0],
     input reg [THREADS_PER_BLOCK-1:0] data_mem_read_ready,
     input reg [DATA_MEM_DATA_BITS-1:0] data_mem_read_data [THREADS_PER_BLOCK-1:0],
-    output reg [THREADS_PER_BLOCK-1:0] data_mem_write_valid,
-    output reg [DATA_MEM_ADDR_BITS-1:0] data_mem_write_address [THREADS_PER_BLOCK-1:0],
-    output reg [DATA_MEM_DATA_BITS-1:0] data_mem_write_data [THREADS_PER_BLOCK-1:0],
+    output wire [THREADS_PER_BLOCK-1:0] data_mem_write_valid,
+    output wire [DATA_MEM_ADDR_BITS-1:0] data_mem_write_address [THREADS_PER_BLOCK-1:0],
+    output wire [DATA_MEM_DATA_BITS-1:0] data_mem_write_data [THREADS_PER_BLOCK-1:0],
     input reg [THREADS_PER_BLOCK-1:0] data_mem_write_ready
 );
     // State
-    reg [2:0] core_state;
-    reg [2:0] fetcher_state;
-    reg [15:0] instruction;
+    wire [2:0] core_state;
+    wire [2:0] fetcher_state;
+    wire [15:0] instruction;
 
     // Intermediate Signals
-    reg [7:0] current_pc;
+    wire [7:0] current_pc;
     wire [7:0] next_pc[THREADS_PER_BLOCK-1:0];
-    reg [7:0] rs[THREADS_PER_BLOCK-1:0];
-    reg [7:0] rt[THREADS_PER_BLOCK-1:0];
-    reg [1:0] lsu_state[THREADS_PER_BLOCK-1:0];
-    reg [7:0] lsu_out[THREADS_PER_BLOCK-1:0];
+    wire [7:0] rs[THREADS_PER_BLOCK-1:0];
+    wire [7:0] rt[THREADS_PER_BLOCK-1:0];
+    wire [1:0] lsu_state[THREADS_PER_BLOCK-1:0];
+    wire [7:0] lsu_out[THREADS_PER_BLOCK-1:0];
     wire [7:0] alu_out[THREADS_PER_BLOCK-1:0];
 
     // Decoded Instruction Signals
-    reg [3:0] decoded_rd_address;
-    reg [3:0] decoded_rs_address;
-    reg [3:0] decoded_rt_address;
-    reg [2:0] decoded_nzp;
-    reg [7:0] decoded_immediate;
+    wire [3:0] decoded_rd_address;
+    wire [3:0] decoded_rs_address;
+    wire [3:0] decoded_rt_address;
+    wire [2:0] decoded_nzp;
+    wire [7:0] decoded_immediate;
 
     // Decoded Control Signals
-    reg decoded_reg_write_enable; // Enable writing to a register
-    reg decoded_mem_read_enable; // Enable reading from memory
-    reg decoded_mem_write_enable; // Enable writing to memory
-    reg decoded_nzp_write_enable; // Enable writing to NZP register
-    reg [1:0] decoded_reg_input_mux; // Select input to register
-    reg [1:0] decoded_alu_arithmetic_mux; // Select arithmetic operation
-    reg decoded_alu_output_mux; // Select operation in ALU
-    reg decoded_pc_mux; // Select source of next PC
-    reg decoded_ret;
+    wire decoded_reg_write_enable; // Enable writing to a register
+    wire decoded_mem_read_enable; // Enable reading from memory
+    wire decoded_mem_write_enable; // Enable writing to memory
+    wire decoded_nzp_write_enable; // Enable writing to NZP register
+    wire [1:0] decoded_reg_input_mux; // Select input to register
+    wire [1:0] decoded_alu_arithmetic_mux; // Select arithmetic operation
+    wire decoded_alu_output_mux; // Select operation in ALU
+    wire decoded_pc_mux; // Select source of next PC
+    wire decoded_ret;
 
     // Fetcher
     fetcher #(
diff --git a/src/dcr.sv b/src/dcr.sv
index 97c0b41..8561698 100644
--- a/src/dcr.sv
+++ b/src/dcr.sv
@@ -10,18 +10,18 @@ module dcr (
 
     input wire device_control_write_enable,
     input wire [7:0] device_control_data,
-    output wire [7:0] thread_count,
+    output wire [7:0] thread_count
 );
     // Store device control data in dedicated register
-    reg [7:0] device_conrol_register;
-    assign thread_count = device_conrol_register[7:0];
+    reg [7:0] device_control_register;
+    assign thread_count = device_control_register[7:0];
 
     always @(posedge clk) begin
         if (reset) begin
-            device_conrol_register <= 8'b0;
+            device_control_register <= 8'b0;
         end else begin
             if (device_control_write_enable) begin
-                device_conrol_register <= device_control_data;
+                device_control_register <= device_control_data;
             end
         end
     end
diff --git a/src/dispatch.sv b/src/dispatch.sv
index f1d5d55..908a595 100644
--- a/src/dispatch.sv
+++ b/src/dispatch.sv
@@ -74,7 +74,9 @@ module dispatch #(
                             ? thread_count - (blocks_dispatched * THREADS_PER_BLOCK)
                             : THREADS_PER_BLOCK;
 
+                        // Keep this blocking: several cores can be handled on the same clock edge, and
+                        // each later loop iteration must read the already-incremented count.
                         blocks_dispatched = blocks_dispatched + 1;
                     end
                 end
             end
@@ -84,7 +86,9 @@ module dispatch #(
                     // If a core just finished executing it's current block, reset it
                     core_reset[i] <= 1;
                     core_start[i] <= 0;
+                    // Keep this blocking: with `<=`, cores finishing on the same clock edge would all
+                    // write blocks_done + 1 from the same old value and completions would be under-counted.
                     blocks_done = blocks_done + 1;
                 end
             end
         end
diff --git a/src/gpu.sv b/src/gpu.sv
index e3d8fcd..2776704 100644
--- a/src/gpu.sv
+++ b/src/gpu.sv
@@ -189,7 +189,7 @@ module gpu #(
                 .DATA_MEM_DATA_BITS(DATA_MEM_DATA_BITS),
                 .PROGRAM_MEM_ADDR_BITS(PROGRAM_MEM_ADDR_BITS),
                 .PROGRAM_MEM_DATA_BITS(PROGRAM_MEM_DATA_BITS),
-                .THREADS_PER_BLOCK(THREADS_PER_BLOCK),
+                .THREADS_PER_BLOCK(THREADS_PER_BLOCK)
             ) core_instance (
                 .clk(clk),
                 .reset(core_reset[i]),
diff --git a/src/scheduler.sv b/src/scheduler.sv
index 6838f91..6dc69d6 100644
--- a/src/scheduler.sv
+++ b/src/scheduler.sv
@@ -14,7 +14,7 @@
 // > Technically, different instructions can branch to different PCs, requiring "branch divergence." In
 //   this minimal implementation, we assume no branch divergence (naive approach for simplicity)
 module scheduler #(
-    parameter THREADS_PER_BLOCK = 4,
+    parameter THREADS_PER_BLOCK = 4
 ) (
     input wire clk,
     input wire reset,
@@ -76,7 +76,9 @@ module scheduler #(
                 end
                 WAIT: begin
                     // Wait for all LSUs to finish their request before continuing
-                    reg any_lsu_waiting = 1'b0;
+                    logic any_lsu_waiting;
+                    any_lsu_waiting = 1'b0;
+
                     for (int i = 0; i < THREADS_PER_BLOCK; i++) begin
                         // Make sure no lsu_state = REQUESTING or WAITING
                         if (lsu_state[i] == 2'b01 || lsu_state[i] == 2'b10) begin
@@ -101,7 +103,8 @@ module scheduler #(
                         core_state <= DONE;
                     end else begin
                         // TODO: Branch divergence. For now assume all next_pc converge
-                        current_pc <= next_pc[THREADS_PER_BLOCK-1];
+                        // Use Thread 0's PC since it is guaranteed to be active if the block is active
+                        current_pc <= next_pc[0];
 
                         // Update is synchronous so we move on after one cycle
                         core_state <= FETCH;