Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ test/logs/*
gds/**/*.gltf

.DS_Store
results.xml
results.xml
docs/*.md
19 changes: 13 additions & 6 deletions src/controller.sv
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,18 @@ module controller #(

channel_serving_consumer = 0;
end else begin
// Local variable to handle arbitration updates within the same cycle
reg [NUM_CONSUMERS-1:0] next_channel_serving_consumer;
next_channel_serving_consumer = channel_serving_consumer;

// For each channel, we handle processing concurrently
for (int i = 0; i < NUM_CHANNELS; i = i + 1) begin
case (controller_state[i])
IDLE: begin
// While this channel is idle, cycle through consumers looking for one with a pending request
for (int j = 0; j < NUM_CONSUMERS; j = j + 1) begin
if (consumer_read_valid[j] && !channel_serving_consumer[j]) begin
channel_serving_consumer[j] = 1;
if (consumer_read_valid[j] && !next_channel_serving_consumer[j]) begin
next_channel_serving_consumer[j] = 1;
current_consumer[i] <= j;

mem_read_valid[i] <= 1;
Expand All @@ -80,8 +84,8 @@ module controller #(

// Once we find a pending request, pick it up with this channel and stop looking for requests
break;
end else if (consumer_write_valid[j] && !channel_serving_consumer[j]) begin
channel_serving_consumer[j] = 1;
end else if (consumer_write_valid[j] && !next_channel_serving_consumer[j]) begin
next_channel_serving_consumer[j] = 1;
current_consumer[i] <= j;

mem_write_valid[i] <= 1;
Expand Down Expand Up @@ -114,20 +118,23 @@ module controller #(
// Wait until consumer acknowledges it received response, then reset
READ_RELAYING: begin
if (!consumer_read_valid[current_consumer[i]]) begin
channel_serving_consumer[current_consumer[i]] = 0;
next_channel_serving_consumer[current_consumer[i]] = 0;
consumer_read_ready[current_consumer[i]] <= 0;
controller_state[i] <= IDLE;
end
end
WRITE_RELAYING: begin
if (!consumer_write_valid[current_consumer[i]]) begin
channel_serving_consumer[current_consumer[i]] = 0;
next_channel_serving_consumer[current_consumer[i]] = 0;
consumer_write_ready[current_consumer[i]] <= 0;
controller_state[i] <= IDLE;
end
end
endcase
end

// Update the state register
channel_serving_consumer <= next_channel_serving_consumer;
end
end
endmodule
59 changes: 30 additions & 29 deletions src/core.sv
Original file line number Diff line number Diff line change
Expand Up @@ -24,52 +24,53 @@ module core #(
input wire [$clog2(THREADS_PER_BLOCK):0] thread_count,

// Program Memory
output reg program_mem_read_valid,
output reg [PROGRAM_MEM_ADDR_BITS-1:0] program_mem_read_address,
// Program Memory
output wire program_mem_read_valid,
output wire [PROGRAM_MEM_ADDR_BITS-1:0] program_mem_read_address,
input reg program_mem_read_ready,
input reg [PROGRAM_MEM_DATA_BITS-1:0] program_mem_read_data,

// Data Memory
output reg [THREADS_PER_BLOCK-1:0] data_mem_read_valid,
output reg [DATA_MEM_ADDR_BITS-1:0] data_mem_read_address [THREADS_PER_BLOCK-1:0],
output wire [THREADS_PER_BLOCK-1:0] data_mem_read_valid,
output wire [DATA_MEM_ADDR_BITS-1:0] data_mem_read_address [THREADS_PER_BLOCK-1:0],
input reg [THREADS_PER_BLOCK-1:0] data_mem_read_ready,
input reg [DATA_MEM_DATA_BITS-1:0] data_mem_read_data [THREADS_PER_BLOCK-1:0],
output reg [THREADS_PER_BLOCK-1:0] data_mem_write_valid,
output reg [DATA_MEM_ADDR_BITS-1:0] data_mem_write_address [THREADS_PER_BLOCK-1:0],
output reg [DATA_MEM_DATA_BITS-1:0] data_mem_write_data [THREADS_PER_BLOCK-1:0],
output wire [THREADS_PER_BLOCK-1:0] data_mem_write_valid,
output wire [DATA_MEM_ADDR_BITS-1:0] data_mem_write_address [THREADS_PER_BLOCK-1:0],
output wire [DATA_MEM_DATA_BITS-1:0] data_mem_write_data [THREADS_PER_BLOCK-1:0],
input reg [THREADS_PER_BLOCK-1:0] data_mem_write_ready
);
// State
reg [2:0] core_state;
reg [2:0] fetcher_state;
reg [15:0] instruction;
wire [2:0] core_state;
wire [2:0] fetcher_state;
wire [15:0] instruction;

// Intermediate Signals
reg [7:0] current_pc;
wire [7:0] current_pc;
wire [7:0] next_pc[THREADS_PER_BLOCK-1:0];
reg [7:0] rs[THREADS_PER_BLOCK-1:0];
reg [7:0] rt[THREADS_PER_BLOCK-1:0];
reg [1:0] lsu_state[THREADS_PER_BLOCK-1:0];
reg [7:0] lsu_out[THREADS_PER_BLOCK-1:0];
wire [7:0] rs[THREADS_PER_BLOCK-1:0];
wire [7:0] rt[THREADS_PER_BLOCK-1:0];
wire [1:0] lsu_state[THREADS_PER_BLOCK-1:0];
wire [7:0] lsu_out[THREADS_PER_BLOCK-1:0];
wire [7:0] alu_out[THREADS_PER_BLOCK-1:0];

// Decoded Instruction Signals
reg [3:0] decoded_rd_address;
reg [3:0] decoded_rs_address;
reg [3:0] decoded_rt_address;
reg [2:0] decoded_nzp;
reg [7:0] decoded_immediate;
wire [3:0] decoded_rd_address;
wire [3:0] decoded_rs_address;
wire [3:0] decoded_rt_address;
wire [2:0] decoded_nzp;
wire [7:0] decoded_immediate;

// Decoded Control Signals
reg decoded_reg_write_enable; // Enable writing to a register
reg decoded_mem_read_enable; // Enable reading from memory
reg decoded_mem_write_enable; // Enable writing to memory
reg decoded_nzp_write_enable; // Enable writing to NZP register
reg [1:0] decoded_reg_input_mux; // Select input to register
reg [1:0] decoded_alu_arithmetic_mux; // Select arithmetic operation
reg decoded_alu_output_mux; // Select operation in ALU
reg decoded_pc_mux; // Select source of next PC
reg decoded_ret;
wire decoded_reg_write_enable; // Enable writing to a register
wire decoded_mem_read_enable; // Enable reading from memory
wire decoded_mem_write_enable; // Enable writing to memory
wire decoded_nzp_write_enable; // Enable writing to NZP register
wire [1:0] decoded_reg_input_mux; // Select input to register
wire [1:0] decoded_alu_arithmetic_mux; // Select arithmetic operation
wire decoded_alu_output_mux; // Select operation in ALU
wire decoded_pc_mux; // Select source of next PC
wire decoded_ret;

// Fetcher
fetcher #(
Expand Down
10 changes: 5 additions & 5 deletions src/dcr.sv
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,18 @@ module dcr (

input wire device_control_write_enable,
input wire [7:0] device_control_data,
output wire [7:0] thread_count,
output wire [7:0] thread_count
);
// Store device control data in dedicated register
reg [7:0] device_conrol_register;
assign thread_count = device_conrol_register[7:0];
reg [7:0] device_control_register;
assign thread_count = device_control_register[7:0];

always @(posedge clk) begin
if (reset) begin
device_conrol_register <= 8'b0;
device_control_register <= 8'b0;
end else begin
if (device_control_write_enable) begin
device_conrol_register <= device_control_data;
device_control_register <= device_control_data;
end
end
end
Expand Down
4 changes: 2 additions & 2 deletions src/dispatch.sv
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ module dispatch #(
? thread_count - (blocks_dispatched * THREADS_PER_BLOCK)
: THREADS_PER_BLOCK;

blocks_dispatched = blocks_dispatched + 1;
blocks_dispatched <= blocks_dispatched + 1;
end
end
end
Expand All @@ -84,7 +84,7 @@ module dispatch #(
// If a core just finished executing its current block, reset it
core_reset[i] <= 1;
core_start[i] <= 0;
blocks_done = blocks_done + 1;
blocks_done <= blocks_done + 1;
end
end
end
Expand Down
2 changes: 1 addition & 1 deletion src/gpu.sv
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ module gpu #(
.DATA_MEM_DATA_BITS(DATA_MEM_DATA_BITS),
.PROGRAM_MEM_ADDR_BITS(PROGRAM_MEM_ADDR_BITS),
.PROGRAM_MEM_DATA_BITS(PROGRAM_MEM_DATA_BITS),
.THREADS_PER_BLOCK(THREADS_PER_BLOCK),
.THREADS_PER_BLOCK(THREADS_PER_BLOCK)
) core_instance (
.clk(clk),
.reset(core_reset[i]),
Expand Down
9 changes: 6 additions & 3 deletions src/scheduler.sv
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
// > Technically, different threads can branch to different PCs, requiring "branch divergence." In
// this minimal implementation, we assume no branch divergence (naive approach for simplicity)
module scheduler #(
parameter THREADS_PER_BLOCK = 4,
parameter THREADS_PER_BLOCK = 4
) (
input wire clk,
input wire reset,
Expand Down Expand Up @@ -76,7 +76,9 @@ module scheduler #(
end
WAIT: begin
// Wait for all LSUs to finish their request before continuing
reg any_lsu_waiting = 1'b0;
logic any_lsu_waiting;
any_lsu_waiting = 1'b0;

for (int i = 0; i < THREADS_PER_BLOCK; i++) begin
// Make sure no lsu_state = REQUESTING or WAITING
if (lsu_state[i] == 2'b01 || lsu_state[i] == 2'b10) begin
Expand All @@ -101,7 +103,8 @@ module scheduler #(
core_state <= DONE;
end else begin
// TODO: Branch divergence. For now assume all next_pc converge
current_pc <= next_pc[THREADS_PER_BLOCK-1];
// Use Thread 0's PC since it is guaranteed to be active if the block is active
current_pc <= next_pc[0];

// Update is synchronous so we move on after one cycle
core_state <= FETCH;
Expand Down