Skip to content

Commit 869ef71

Browse files
jspark1105facebook-github-bot
authored andcommitted
AsyncNet: option for time based tracing and trace path (pytorch#13440)
Summary: Pull Request resolved: pytorch#13440 Time based tracing is easier to look at when multiple nets are running asynchronously. This diff also adds an option to change the path to dump trace files. Reviewed By: aazzolini, ilia-cher Differential Revision: D12479259 fbshipit-source-id: 94d379634ba7b90c111c92b1136ffa4226b8bb8c
1 parent 556ff8e commit 869ef71

File tree

3 files changed

+169
-12
lines changed

3 files changed

+169
-12
lines changed

caffe2/core/net_async_tracing.cc

+66-10
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,18 @@ int getCounterForNetName(const std::string& net_name) {
5353
return counter;
5454
}
5555

56-
Tracer::Tracer(const NetBase* net, const std::string& net_name)
57-
: net_(net), filename_(net_name), iter_(0) {
56+
Tracer::Tracer(
57+
const NetBase* net,
58+
const std::string& net_name,
59+
TracingConfig config)
60+
: net_(net),
61+
filename_(net_name),
62+
iter_(0),
63+
dumping_iter_(0),
64+
config_(config) {
5865
std::replace(filename_.begin(), filename_.end(), '/', '_');
59-
filename_ = FLAGS_caffe2_net_async_tracing_filepath + "/" + filename_ +
60-
+"_id_" + caffe2::to_string(getCounterForNetName(net_name));
66+
filename_ = this->config().filepath + "/" + filename_ + "_id_" +
67+
caffe2::to_string(getCounterForNetName(net_name));
6168
timer_.Start();
6269
}
6370

@@ -251,6 +258,10 @@ int Tracer::bumpIter() {
251258
return iter_++;
252259
}
253260

261+
int Tracer::bumpDumpingIter() {
262+
return dumping_iter_++;
263+
}
264+
254265
void Tracer::dumpTracingResultAndClearEvents(const std::string& file_suffix) {
255266
if (events_.empty() || filename_.empty()) {
256267
return;
@@ -294,7 +305,9 @@ void TracerGuard::addArgument(TracingField field, const char* value) {
294305
event_.category_ = value;
295306
break;
296307
}
297-
default: { CAFFE_THROW("Unexpected tracing string field ", field); }
308+
default: {
309+
CAFFE_THROW("Unexpected tracing string field ", field);
310+
}
298311
}
299312
}
300313

@@ -316,7 +329,9 @@ void TracerGuard::addArgument(TracingField field, int value) {
316329
event_.thread_label_ = value;
317330
break;
318331
}
319-
default: { CAFFE_THROW("Unexpected tracing int field ", field); }
332+
default: {
333+
CAFFE_THROW("Unexpected tracing int field ", field);
334+
}
320335
}
321336
}
322337

@@ -388,25 +403,66 @@ bool hasEnableTracingFlag(const NetBase* net) {
388403
return GetFlagArgument(net->debug_def(), "enable_tracing", false);
389404
}
390405

406+
TracingConfig getTracingConfigFromNet(const NetBase* net) {
407+
ArgumentHelper arg_helper(net->debug_def());
408+
TracingConfig cfg;
409+
410+
cfg.mode = (arg_helper.GetSingleArgument<std::string>("tracing_mode", "") ==
411+
"GLOBAL_TIMESLICE")
412+
? TracingMode::GLOBAL_TIMESLICE
413+
: TracingMode::EVERY_K_ITERATIONS;
414+
415+
cfg.filepath = arg_helper.GetSingleArgument<std::string>(
416+
"tracing_filepath", FLAGS_caffe2_net_async_tracing_filepath);
417+
418+
cfg.trace_every_nth_batch = arg_helper.GetSingleArgument<int>(
419+
"trace_every_nth_batch", FLAGS_caffe2_net_async_tracing_nth);
420+
cfg.dump_every_nth_batch = arg_helper.GetSingleArgument<int>(
421+
"dump_every_nth_batch", FLAGS_caffe2_net_async_tracing_dumping_nth);
422+
423+
cfg.trace_for_n_ms =
424+
arg_helper.GetSingleArgument<int>("trace_for_n_ms", cfg.trace_for_n_ms);
425+
cfg.trace_every_n_ms = arg_helper.GetSingleArgument<int>(
426+
"trace_every_n_ms", cfg.trace_every_n_ms);
427+
428+
return cfg;
429+
};
430+
391431
std::shared_ptr<Tracer> create(
392432
const NetBase* net,
393433
const std::string& net_name) {
394434
// Enable the tracer if the net has the "enable_tracing" argument set OR
395435
// if the command line option includes the net name option in the list of
396436
// tracable nets.
397437
bool trace_net = hasEnableTracingFlag(net) || isTraceableNetName(net_name);
398-
return trace_net ? std::make_shared<Tracer>(net, net_name) : nullptr;
438+
return trace_net
439+
? std::make_shared<Tracer>(net, net_name, getTracingConfigFromNet(net))
440+
: nullptr;
399441
}
400442

401443
bool startIter(const std::shared_ptr<Tracer>& tracer) {
402444
if (!tracer) {
403445
return false;
404446
}
405447
auto iter = tracer->bumpIter();
406-
auto is_enabled = iter % FLAGS_caffe2_net_async_tracing_nth == 0;
448+
bool is_enabled;
449+
bool should_dump;
450+
if (tracer->config().mode == TracingMode::EVERY_K_ITERATIONS) {
451+
is_enabled = iter % tracer->config().trace_every_nth_batch == 0;
452+
should_dump = iter % tracer->config().dump_every_nth_batch == 0;
453+
} else {
454+
using namespace std::chrono;
455+
auto ms =
456+
duration_cast<milliseconds>(system_clock::now().time_since_epoch())
457+
.count();
458+
is_enabled = (ms % tracer->config().trace_every_n_ms) <
459+
tracer->config().trace_for_n_ms;
460+
// dump just after disabled tracing
461+
should_dump = tracer->isEnabled() && !is_enabled;
462+
}
407463
tracer->setEnabled(is_enabled);
408-
if (iter % FLAGS_caffe2_net_async_tracing_dumping_nth == 0) {
409-
int dumping_iter = iter / FLAGS_caffe2_net_async_tracing_dumping_nth;
464+
if (should_dump) {
465+
int dumping_iter = tracer->bumpDumpingIter();
410466
tracer->dumpTracingResultAndClearEvents(caffe2::to_string(dumping_iter));
411467
}
412468
return is_enabled;

caffe2/core/net_async_tracing.h

+31-2
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,30 @@ enum TracingField {
5050
TRACE_CATEGORY,
5151
};
5252

53+
enum class TracingMode {
54+
EVERY_K_ITERATIONS,
55+
GLOBAL_TIMESLICE,
56+
};
57+
58+
struct TracingConfig {
59+
TracingMode mode{TracingMode::EVERY_K_ITERATIONS};
60+
std::string filepath{"/tmp"};
61+
62+
// for TracingMode::EVERY_K_ITERATIONS
63+
int64_t trace_every_nth_batch = 100;
64+
int64_t dump_every_nth_batch = 10000;
65+
66+
// for TracingMode::GLOBAL_TIMESLICE
67+
int64_t trace_every_n_ms = 2 * 60 * 1000; // 2min
68+
int64_t trace_for_n_ms = 1000; // 1sec
69+
};
70+
5371
class CAFFE2_API Tracer {
5472
public:
55-
Tracer(const NetBase* net, const std::string& net_name);
73+
Tracer(
74+
const NetBase* net,
75+
const std::string& net_name,
76+
TracingConfig = TracingConfig{});
5677

5778
void recordEvent(const TracerEvent& event);
5879
std::string opTraceName(const OperatorBase* op);
@@ -62,7 +83,11 @@ class CAFFE2_API Tracer {
6283
void renameThreads();
6384
void setEnabled(bool enabled);
6485
bool isEnabled() const;
86+
const TracingConfig& config() {
87+
return config_;
88+
}
6589
int bumpIter();
90+
int bumpDumpingIter();
6691
// Dump the tracing result to file with given suffix, and then
6792
// clear current events.
6893
void dumpTracingResultAndClearEvents(const std::string& file_suffix);
@@ -77,6 +102,8 @@ class CAFFE2_API Tracer {
77102
bool enabled_ = false;
78103
Timer timer_;
79104
int iter_;
105+
int dumping_iter_;
106+
TracingConfig config_;
80107

81108
friend class TracerGuard;
82109
};
@@ -115,7 +142,9 @@ CAFFE2_API int extractShardId(const std::string& name);
115142
// line flag)
116143
CAFFE2_API bool isTraceableNetName(const std::string& net_name);
117144

118-
CAFFE2_API std::shared_ptr<Tracer> create(const NetBase* net, const std::string& net_name);
145+
CAFFE2_API std::shared_ptr<Tracer> create(
146+
const NetBase* net,
147+
const std::string& net_name);
119148
CAFFE2_API bool startIter(const std::shared_ptr<Tracer>& tracer);
120149

121150
} // namespace tracing

caffe2/core/net_async_tracing_test.cc

+72
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,78 @@ TEST(NetAsyncTracingTest, ExtractShardId) {
3737
testExtractShardId("FC:shard:15", 15);
3838
}
3939

40+
TEST(NetAsyncTracingTest, EveryKIteration) {
41+
const auto spec = R"DOC(
42+
name: "example"
43+
type: "async_scheduling"
44+
arg {
45+
name: "enable_tracing"
46+
i: 1
47+
}
48+
arg {
49+
name: "tracing_mode"
50+
s: "EVERY_K_ITERATIONS"
51+
}
52+
arg {
53+
name: "tracing_filepath"
54+
s: "/tmp"
55+
}
56+
arg {
57+
name: "trace_every_nth_batch"
58+
i: 1
59+
}
60+
arg {
61+
name: "dump_every_nth_batch"
62+
i: 1
63+
}
64+
op {
65+
output: "out"
66+
type: "UniformFill"
67+
}
68+
)DOC";
69+
70+
NetDef net_def;
71+
CAFFE_ENFORCE(TextFormat::ParseFromString(spec, &net_def));
72+
73+
Workspace ws;
74+
std::unique_ptr<NetBase> net(CreateNet(net_def, &ws));
75+
net->Run();
76+
}
77+
78+
TEST(NetAsyncTracingTest, GlobalTimeSlice) {
79+
const auto spec = R"DOC(
80+
name: "example"
81+
type: "async_scheduling"
82+
arg {
83+
name: "enable_tracing"
84+
i: 1
85+
}
86+
arg {
87+
name: "tracing_filepath"
88+
s: "/tmp"
89+
}
90+
arg {
91+
name: "trace_for_n_ms"
92+
i: 1
93+
}
94+
arg {
95+
name: "trace_every_n_ms"
96+
i: 1
97+
}
98+
op {
99+
output: "out"
100+
type: "UniformFill"
101+
}
102+
)DOC";
103+
104+
NetDef net_def;
105+
CAFFE_ENFORCE(TextFormat::ParseFromString(spec, &net_def));
106+
107+
Workspace ws;
108+
std::unique_ptr<NetBase> net(CreateNet(net_def, &ws));
109+
net->Run();
110+
}
111+
40112
} // namespace tracing
41113

42114
} // namespace caffe2

0 commit comments

Comments
 (0)