Skip to content

Commit e84b891

Browse files
authored
Remote link discovery (#1572)
* Changed restart so that it can restart with new parallelism as long as the total number of threads in the run is the same (i.e. no need to change what is in each partition, just remap partitions to new rank/thread pairs). For any new parallelism, SST will automatically discover which rank remote Links are on. * Fixed link connection logic in test_StatisticsComponent_basic.py and added tests that do remapping on a restart. * Moved some Link APIs to be protected and added functions to ThreadSync, which is a friend of Link, so child classes can access the moved functionality. * Fixed typos in comments and output strings
1 parent 046307a commit e84b891

File tree

13 files changed

+347
-56
lines changed

13 files changed

+347
-56
lines changed

src/sst/core/link.cc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@
3333

3434
namespace SST {
3535

36+
bool Link::is_restart_same_parallelism = false;
37+
3638
void
3739
SST::Core::Serialization::serialize_impl<Link*>::serialize_events(
3840
serializer& ser, uintptr_t delivery_info, ActivityQueue* queue)
@@ -270,8 +272,9 @@ SST::Core::Serialization::serialize_impl<Link*>::operator()(Link*& s, serializer
270272
if ( pair_restart_rank.rank == RankInfo::UNASSIGNED ) pair_restart_rank = pair_rank;
271273
}
272274

273-
bool is_restart_sync = (my_restart_rank != pair_restart_rank);
274-
275+
// Need to know if this link is a sync link or not. We can only do this if the restart uses the exact same
276+
// rank/thread counts as the checkpoint. This info is stored in Link::is_restart_same_parallelism
277+
bool is_restart_sync = (Link::is_restart_same_parallelism && my_restart_rank != pair_restart_rank);
275278

276279
/*
277280
Create or get link from tracker

src/sst/core/link.h

Lines changed: 23 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -316,27 +316,6 @@ class alignas(64) Link
316316
*/
317317
bool isConfigured() { return type != UNINITIALIZED; }
318318

319-
/**
320-
Get the latency on the link in units of core atomic time base
321-
322-
NOTE: This is a core only API and not part of the public stable API
323-
*/
324-
SimTime_t getLatency() { return latency; }
325-
326-
/**
327-
Get the delivery_info for the link
328-
329-
NOTE: This is a core only API and not part of the public stable API
330-
*/
331-
uintptr_t getDeliveryInfo() { return delivery_info; }
332-
333-
/**
334-
Get the pair_link
335-
336-
NOTE: This is a core only API and not part of the public stable API
337-
*/
338-
Link* getPairLink() { return pair_link; }
339-
340319

341320
#ifdef __SST_DEBUG_EVENT_TRACKING__
342321
void setSendingComponentInfo(const std::string& comp_in, const std::string& type_in, const std::string& port_in)
@@ -379,6 +358,23 @@ class alignas(64) Link
379358
// else pair_link->tag = 0x80000000 | new_tag;
380359
}
381360

361+
/**
362+
Get the latency on the link in units of core atomic time base
363+
364+
NOTE: This is a core only API and not part of the public stable API
365+
*/
366+
SimTime_t getLatency() { return latency; }
367+
368+
/**
369+
Get the delivery_info for the link
370+
*/
371+
uintptr_t getDeliveryInfo() { return delivery_info; }
372+
373+
/**
374+
Get the pair_link
375+
*/
376+
Link* getPairLink() { return pair_link; }
377+
382378
/**
383379
Sends an Event over a Link with an additional delay specified with a TimeConverter. I.e. the total delay is the
384380
Link's delay + the additional specified delay.
@@ -402,6 +398,12 @@ class alignas(64) Link
402398
event->updateDeliveryInfo(delivery_info);
403399
}
404400

401+
/**
402+
Variable used by Link restarts to know whether stored rank data for remote links is still valid (i.e. did we
403+
restart with the exact same rank/thread count or not).
404+
*/
405+
static bool is_restart_same_parallelism;
406+
405407
// Since Links are found in pairs, I will keep all the information
406408
// needed for me to send and deliver an event to the other side of
407409
// the Link. That means, that I mostly keep my pair's

src/sst/core/main.cc

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -518,7 +518,7 @@ start_simulation(uint32_t tid, SimThreadInfo_t& info, Core::ThreadSafe::Barrier&
518518
if ( restart ) {
519519

520520
// Finish parsing checkpoint for restart
521-
sim->restart();
521+
sim->restart(info.graph);
522522

523523
barrier.wait();
524524

@@ -528,6 +528,17 @@ start_simulation(uint32_t tid, SimThreadInfo_t& info, Core::ThreadSafe::Barrier&
528528

529529
barrier.wait();
530530

531+
// Need to detect the sync intervals
532+
if ( info.myRank.thread == 0 ) {
533+
sim->findRankSyncInterval();
534+
}
535+
536+
barrier.wait();
537+
538+
sim->findThreadSyncInterval();
539+
540+
barrier.wait();
541+
531542
} // if ( restart )
532543

533544

src/sst/core/model/configGraph.cc

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -627,12 +627,13 @@ ConfigGraph::splitGraph(const std::set<uint32_t>& orig_rank_set, const std::set<
627627
graph = new ConfigGraph();
628628

629629
// Need to copy over any restart data
630-
graph->cpt_ranks = cpt_ranks;
631-
graph->cpt_currentSimCycle = cpt_currentSimCycle;
632-
graph->cpt_currentPriority = cpt_currentPriority;
633-
graph->cpt_minPart = cpt_minPart;
634-
graph->cpt_minPartTC = cpt_minPartTC;
635-
graph->cpt_max_event_id = cpt_max_event_id;
630+
graph->cpt_ranks = cpt_ranks;
631+
graph->cpt_currentSimCycle = cpt_currentSimCycle;
632+
graph->cpt_currentPriority = cpt_currentPriority;
633+
graph->cpt_minPart = cpt_minPart;
634+
graph->cpt_minPartTC = cpt_minPartTC;
635+
graph->cpt_max_event_id = cpt_max_event_id;
636+
graph->cpt_remap_partitions = cpt_remap_partitions;
636637

637638
graph->cpt_libnames = cpt_libnames;
638639
graph->cpt_shared_objects = cpt_shared_objects;

src/sst/core/model/configGraph.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,7 @@ class ConfigGraph : public SST::Core::Serialization::serializable
211211
SST_SER(cpt_minPart);
212212
SST_SER(cpt_minPartTC);
213213
SST_SER(cpt_max_event_id);
214+
SST_SER(cpt_remap_partitions);
214215

215216
SST_SER(*(cpt_libnames.get()));
216217
SST_SER(*(cpt_shared_objects.get()));
@@ -225,7 +226,8 @@ class ConfigGraph : public SST::Core::Serialization::serializable
225226
int cpt_currentPriority = 0;
226227
SimTime_t cpt_minPart = std::numeric_limits<SimTime_t>::max();
227228
TimeConverter cpt_minPartTC;
228-
uint64_t cpt_max_event_id = 0;
229+
uint64_t cpt_max_event_id = 0;
230+
bool cpt_remap_partitions = false;
229231

230232
std::shared_ptr<std::set<std::string>> cpt_libnames = std::make_shared<std::set<std::string>>();
231233
std::shared_ptr<std::vector<char>> cpt_shared_objects = std::make_shared<std::vector<char>>();

src/sst/core/model/restart/sstcptmodel.cc

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -121,11 +121,17 @@ SSTCPTModelDefinition::createConfigGraph()
121121
if ( (cfg.num_ranks() != graph->cpt_ranks.rank || cfg.num_threads() != graph->cpt_ranks.thread) &&
122122
!(cfg.num_threads() == 1 && cfg.num_ranks() == 1) ) {
123123

124-
Output::getDefaultObject().fatal(CALL_INFO, 1,
125-
"Rank or thread counts do not match checkpoint. "
126-
"Checkpoint requires %" PRIu32 " ranks and %" PRIu32 " threads. "
127-
"Serial restarts are also permitted.\n",
128-
graph->cpt_ranks.rank, graph->cpt_ranks.thread);
124+
// We can proceed if the total number of partitions is the same, otherwise, error
125+
if ( (cfg.num_ranks() * cfg.num_threads()) == (graph->cpt_ranks.rank * graph->cpt_ranks.thread) ) {
126+
graph->cpt_remap_partitions = true;
127+
}
128+
else {
129+
Output::getDefaultObject().fatal(CALL_INFO, 1,
130+
"Rank or thread counts do not match checkpoint. Checkpoint/restart requires that the total parallelism "
131+
"be the same between a checkpoint and restart (i.e. ranks * threads is the same for both). Checkpoint "
132+
"was created with %" PRIu32 " ranks and %" PRIu32 " threads. Serial restarts are also permitted.\n",
133+
graph->cpt_ranks.rank, graph->cpt_ranks.thread);
134+
}
129135
}
130136
/******** ^^ Works for regular and N->1 restart ^^ ***********/
131137

0 commit comments

Comments
 (0)