Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

#0: Saved timestamped data in dispatcher #18790

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tt_metal/api/tt-metalium/cq_commands.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ struct CQDispatchDelayCmd {

struct CQDispatchSetWriteOffsetCmd {
uint8_t pad1;
uint16_t pad2;
uint16_t program_host_id; // Program Host ID for upcoming commands. Used for profiling.
uint32_t offset0;
uint32_t offset1;
uint32_t offset2;
Expand Down
1 change: 1 addition & 0 deletions tt_metal/api/tt-metalium/device_command.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,7 @@ class DeviceCommand {
auto initialize_write_offset_cmd = [&](CQDispatchCmd* write_offset_cmd) {
*write_offset_cmd = {};
write_offset_cmd->base.cmd_id = CQ_DISPATCH_CMD_SET_WRITE_OFFSET;
write_offset_cmd->set_write_offset.program_host_id = 0;
write_offset_cmd->set_write_offset.offset0 = write_offset0;
write_offset_cmd->set_write_offset.offset1 = write_offset1;
write_offset_cmd->set_write_offset.offset2 = write_offset2;
Expand Down
11 changes: 9 additions & 2 deletions tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,10 @@ FORCE_INLINE volatile uint32_t* get_cq_completion_write_ptr() {
return reinterpret_cast<volatile uint32_t*>(dev_completion_q_wr_ptr);
}

// Identifiers passed as the first argument of DeviceTimestampedData() by the
// dispatcher; each one labels the kind of value recorded in the second argument.
// NOTE(review): presumably consumed by the device profiler -- confirm against
// the DeviceTimestampedData implementation.

// Recorded value is the command ID (cmd->base.cmd_id) of the command being processed.
constexpr uint32_t DISPATCH_COMMAND_ID_DATA{1};
// Recorded value is the program host ID carried by a SET_WRITE_OFFSET command.
constexpr uint32_t DISPATCH_PROGRAM_HOST_ID_DATA{2};
// Recorded value is a command subtype; for WRITE_PACKED it is whether the
// multicast flag is set.
constexpr uint32_t DISPATCH_COMMAND_SUBTYPE_DATA{3};

FORCE_INLINE
void completion_queue_reserve_back(uint32_t num_pages) {
WAYPOINT("QRBW");
Expand Down Expand Up @@ -981,6 +985,7 @@ static inline bool process_cmd_d(

re_run_command:
volatile CQDispatchCmd tt_l1_ptr* cmd = (volatile CQDispatchCmd tt_l1_ptr*)cmd_ptr;
DeviceTimestampedData(DISPATCH_COMMAND_ID_DATA, cmd->base.cmd_id);

switch (cmd->base.cmd_id) {
case CQ_DISPATCH_CMD_WRITE_LINEAR:
Expand Down Expand Up @@ -1020,6 +1025,7 @@ static inline bool process_cmd_d(
case CQ_DISPATCH_CMD_WRITE_PACKED: {
DPRINT << "cmd_write_packed" << ENDL();
uint32_t flags = cmd->write_packed.flags;
DeviceTimestampedData(DISPATCH_COMMAND_SUBTYPE_DATA, bool{flags & CQ_DISPATCH_CMD_PACKED_WRITE_FLAG_MCAST});
if (flags & CQ_DISPATCH_CMD_PACKED_WRITE_FLAG_MCAST) {
process_write_packed<true, CQDispatchWritePackedMulticastSubCmd>(
flags, l1_cache, block_noc_writes_to_clear, block_next_start_addr);
Expand Down Expand Up @@ -1083,8 +1089,9 @@ static inline bool process_cmd_d(
case CQ_DISPATCH_SET_GO_SIGNAL_NOC_DATA: set_go_signal_noc_data(); break;

case CQ_DISPATCH_CMD_SET_WRITE_OFFSET:
DPRINT << "write offset: " << cmd->set_write_offset.offset0 << " " << cmd->set_write_offset.offset1 << " "
<< cmd->set_write_offset.offset2 << ENDL();
DPRINT << " " << cmd->set_write_offset.offset0 << " " << cmd->set_write_offset.offset1 << " "
<< cmd->set_write_offset.offset2 << " host id " << cmd->set_write_offset.program_host_id << ENDL();
DeviceTimestampedData(DISPATCH_PROGRAM_HOST_ID_DATA, cmd->set_write_offset.program_host_id);
write_offset[0] = cmd->set_write_offset.offset0;
write_offset[1] = cmd->set_write_offset.offset1;
write_offset[2] = cmd->set_write_offset.offset2;
Expand Down
6 changes: 6 additions & 0 deletions tt_metal/impl/program/dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1535,6 +1535,8 @@ void update_program_dispatch_commands(
(sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, set_write_offset.offset1));
static constexpr uint32_t eth_l1_write_offset_offset =
(sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, set_write_offset.offset2));
static constexpr uint32_t program_host_id_offset =
(sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, set_write_offset.program_host_id));
// Update Stall Command Sequence
if (program_binary_status != ProgramBinaryStatus::Committed) {
// Program binary is in flight. Issue a Prefetch Stall
Expand All @@ -1554,6 +1556,10 @@ void update_program_dispatch_commands(
tensix_l1_write_offset_offset,
&dispatch_md.kernel_config_addrs[hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)],
sizeof(uint32_t));
// The runtime ID may be truncated to fit the width of the program_host_id field.
decltype(std::declval<CQDispatchCmd>().set_write_offset.program_host_id) runtime_id = program.get_runtime_id();
cached_program_command_sequence.preamble_command_sequence.update_cmd_sequence(
program_host_id_offset, &runtime_id, sizeof(runtime_id));
if (hal.get_programmable_core_type_count() >= 2) {
cached_program_command_sequence.preamble_command_sequence.update_cmd_sequence(
eth_l1_write_offset_offset,
Expand Down
Loading