Commit 18b50f8

improve clock skew calculations and reporting (#24130)
1 parent 4d56ef2 commit 18b50f8

File tree

3 files changed: +159 −86 lines changed

ydb/core/health_check/health_check.cpp

Lines changed: 124 additions & 77 deletions
@@ -149,7 +149,8 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         TabletState,
         SystemTabletState,
         OverloadState,
-        SyncState,
+        NodeClockSkew,
+        DatabaseClockSkew,
         Uptime,
         QuotaUsage,
         BridgeGroupState,
@@ -268,9 +269,11 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         ui64 StorageQuota = 0;
         ui64 StorageUsage = 0;
         TMaybeServerlessComputeResourcesMode ServerlessComputeResourcesMode;
-        TNodeId MaxTimeDifferenceNodeId = 0;
         TString Path;
         THashMap<TString, TVector<TNodeId>> PileNodeIds;
+        ui64 MaxClockSkewUs = 0; // maximum clock skew between database nodes
+        TNodeId MaxClockSkewNodeId = 0; // node id reported by most of the other nodes
+        i64 MaxClockSkewNodeAvgUs = 0; // average clock skew reported by most of the other nodes
     };

     struct TGroupState {
@@ -665,6 +668,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
     std::optional<TRequestResponse<TEvConsole::TEvListTenantsResponse>> ListTenants;
     std::optional<TRequestResponse<TEvInterconnect::TEvNodesInfo>> NodesInfo;
     THashMap<TNodeId, const TEvInterconnect::TNodeInfo*> MergedNodeInfo;
+    std::vector<std::unordered_set<TNodeId>> PileNumToNodeIds;
     std::optional<TRequestResponse<TEvSysView::TEvGetStoragePoolsResponse>> StoragePools;
     std::optional<TRequestResponse<TEvSysView::TEvGetGroupsResponse>> Groups;
     std::optional<TRequestResponse<TEvSysView::TEvGetVSlotsResponse>> VSlots;
@@ -1469,6 +1473,14 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
                 RequestComputeNode(ni.NodeId);
             }
         }
+        if (NodesInfo->Get()->PileMap) {
+            for (size_t pileNum = 0; pileNum < NodesInfo->Get()->PileMap->size(); ++pileNum) {
+                PileNumToNodeIds.emplace_back();
+                for (TNodeId nodeId : (*NodesInfo->Get()->PileMap)[pileNum]) {
+                    PileNumToNodeIds[pileNum].insert(nodeId);
+                }
+            }
+        }
         RequestDone("TEvNodesInfo");
     }

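For reference, a minimal standalone sketch of the lookup structure built in the hunk above, assuming the pile map is simply a vector of node-id lists indexed by pile number (the names and simplified types here are illustrative, not the YDB API):

#include <cstdint>
#include <unordered_set>
#include <vector>

using TNodeId = uint32_t; // simplified stand-in for the actual node id type

// Flatten a pile map (one node-id list per pile number) into per-pile hash sets,
// so that "does node N belong to pile P?" becomes an O(1) lookup.
std::vector<std::unordered_set<TNodeId>> BuildPileLookup(const std::vector<std::vector<TNodeId>>& pileMap) {
    std::vector<std::unordered_set<TNodeId>> pileNumToNodeIds;
    pileNumToNodeIds.reserve(pileMap.size());
    for (const auto& pileNodes : pileMap) {
        pileNumToNodeIds.emplace_back(pileNodes.begin(), pileNodes.end());
    }
    return pileNumToNodeIds;
}

int main() {
    // two piles: pile 0 = nodes {1, 2}, pile 1 = nodes {3}
    auto lookup = BuildPileLookup({{1, 2}, {3}});
    return lookup[1].count(3) == 1 ? 0 : 1; // node 3 belongs to pile 1
}

FillComputePileStatus below filters a database's compute nodes through exactly this kind of per-pile set before reporting pile-level issues.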
@@ -2103,30 +2115,19 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
                 loadAverageStatus.set_overall(laContext.GetOverallStatus());
             }

-            if (nodeSystemState.HasMaxClockSkewPeerId()) {
-                TNodeId peerId = nodeSystemState.GetMaxClockSkewPeerId();
-                long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs();
-                TDuration timeDifferenceDuration = TDuration::MicroSeconds(abs(timeDifferenceUs));
-                Ydb::Monitoring::StatusFlag::Status status;
-                if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceOrange())) {
-                    status = Ydb::Monitoring::StatusFlag::ORANGE;
-                } else if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceYellow())) {
-                    status = Ydb::Monitoring::StatusFlag::YELLOW;
+            if (databaseState.MaxClockSkewNodeId == nodeId) {
+                TSelfCheckContext tdContext(&context, "CLOCK_SKEW");
+                Ydb::Monitoring::ClockSkewStatus& clockSkewStatus = *computeNodeStatus.mutable_clock_skew();
+                ui64 clockSkew = abs(databaseState.MaxClockSkewNodeAvgUs);
+                clockSkewStatus.set_clock_skew(-databaseState.MaxClockSkewNodeAvgUs / 1000); // in ms
+                if (clockSkew >= HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceOrange()) {
+                    tdContext.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Clock skew exceeds threshold", ETags::NodeClockSkew);
+                } else if (clockSkew >= HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceYellow()) {
+                    tdContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Clock skew above recommended limit", ETags::NodeClockSkew);
                 } else {
-                    status = Ydb::Monitoring::StatusFlag::GREEN;
-                }
-
-                if (databaseState.MaxTimeDifferenceNodeId == nodeId) {
-                    TSelfCheckContext tdContext(&context, "NODES_TIME_DIFFERENCE");
-                    if (status == Ydb::Monitoring::StatusFlag::GREEN) {
-                        tdContext.ReportStatus(status);
-                    } else {
-                        tdContext.ReportStatus(status, TStringBuilder() << "Node is "
-                            << timeDifferenceDuration.MilliSeconds() << " ms "
-                            << (timeDifferenceUs > 0 ? "behind " : "ahead of ")
-                            << "peer [" << peerId << "]", ETags::SyncState);
-                    }
+                    tdContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
                 }
+                clockSkewStatus.set_overall(tdContext.GetOverallStatus());
             }
         } else {
             // context.ReportStatus(Ydb::Monitoring::StatusFlag::RED,
@@ -2137,7 +2138,29 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         computeNodeStatus.set_overall(context.GetOverallStatus());
     }

-    void FillComputeDatabaseStatus(TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
+    void FillComputePileStatus(TDatabaseState& databaseState, size_t pileNum, const TString& pileName,
+            const TVector<TNodeId>& computeNodeIds, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
+        // temporary solution - it's better to rework the concept of "piles" in health check
+        if (PileNumToNodeIds.size() <= pileNum) {
+            return;
+        }
+        context.Location.mutable_compute()->mutable_pile()->set_name(pileName);
+        const auto& pileNodes(PileNumToNodeIds[pileNum]);
+        for (TNodeId nodeId : computeNodeIds) {
+            if (pileNodes.count(nodeId) == 0) {
+                continue;
+            }
+            auto& computeNode = *computeStatus.add_nodes();
+            FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"});
+
+        }
+        context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::PileComputeState, {ETags::Uptime});
+        context.ReportWithMaxChildStatus("Compute is overloaded", ETags::PileComputeState, {ETags::OverloadState});
+        context.ReportWithMaxChildStatus("Compute quota usage", ETags::PileComputeState, {ETags::QuotaUsage});
+        context.ReportWithMaxChildStatus("Clock skew issues", ETags::PileComputeState, {ETags::NodeClockSkew});
+    }
+
+    void FillComputeDatabaseQuota(TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
         auto itDescribe = DescribeByPath.find(databaseState.Path);
         if (itDescribe != DescribeByPath.end() && itDescribe->second.IsOk()) {
             const auto& domain(itDescribe->second->GetRecord().GetPathDescription().GetDomainDescription());
@@ -2170,13 +2193,54 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         }
     }

+    void FillComputeDatabaseClockSkew(TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
+        ui64 clockSkew = databaseState.MaxClockSkewUs;
+        if (clockSkew >= HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceOrange()) {
+            context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Clock skew exceeds threshold", ETags::DatabaseClockSkew, {ETags::NodeClockSkew});
+        } else if (clockSkew >= HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceYellow()) {
+            context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Clock skew above recommended limit", ETags::DatabaseClockSkew, {ETags::NodeClockSkew});
+        } else {
+            context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
+        }
+        computeStatus.mutable_clock_skew()->set_clock_skew(clockSkew / 1000); // in ms
+        computeStatus.mutable_clock_skew()->set_overall(context.GetOverallStatus());
+    }
+
+    struct TNodeClockSkewState {
+        TNodeId NodeId = 0;
+        int NumberOfReporters = 0;
+        i64 SumClockSkewUs = 0;
+    };
+
+    void CalculateClockSkewState(TDatabaseState& databaseState, const TVector<TNodeId>& computeNodes) {
+        std::unordered_map<TNodeId, TNodeClockSkewState> nodeClockSkewState;
+        for (TNodeId nodeId : computeNodes) {
+            auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
+            if (itNodeSystemState != MergedNodeSystemState.end()) {
+                const NKikimrWhiteboard::TSystemStateInfo& nodeSystemState(*itNodeSystemState->second);
+                if (nodeSystemState.HasMaxClockSkewPeerId()) {
+                    TNodeId peerId = nodeSystemState.GetMaxClockSkewPeerId();
+                    i64 timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs();
+                    auto& clockSkewState = nodeClockSkewState[peerId];
+                    clockSkewState.NumberOfReporters++;
+                    clockSkewState.SumClockSkewUs += timeDifferenceUs;
+                    databaseState.MaxClockSkewUs = std::max<ui64>(databaseState.MaxClockSkewUs, abs(timeDifferenceUs));
+                }
+            }
+        }
+        auto itMaxClockSkew = std::ranges::max_element(nodeClockSkewState, [](const auto& a, const auto& b) {
+            return a.second.NumberOfReporters > b.second.NumberOfReporters;
+        });
+        if (itMaxClockSkew != nodeClockSkewState.end()) {
+            if (itMaxClockSkew->second.NumberOfReporters * 2 >= static_cast<int>(nodeClockSkewState.size())) { // at least 50% of reporters
+                databaseState.MaxClockSkewNodeId = itMaxClockSkew->first;
+                databaseState.MaxClockSkewNodeAvgUs = itMaxClockSkew->second.SumClockSkewUs / itMaxClockSkew->second.NumberOfReporters;
+            }
+        }
+    }
+
     void FillCompute(TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
         TVector<TNodeId>* computeNodeIds = &databaseState.ComputeNodeIds;
-        auto report = [](TSelfCheckContext& context, ETags tag) {
-            context.ReportWithMaxChildStatus("Some nodes are restarting too often", tag, {ETags::Uptime});
-            context.ReportWithMaxChildStatus("Compute is overloaded", tag, {ETags::OverloadState});
-            context.ReportWithMaxChildStatus("Compute quota usage", tag, {ETags::QuotaUsage});
-        };
         if (databaseState.ResourcePathId
             && databaseState.ServerlessComputeResourcesMode != NKikimrSubDomains::EServerlessComputeResourcesModeExclusive)
         {
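The hunk above moves skew detection from "largest single measurement" to a majority vote across reporters: each node names the one peer it sees the largest skew against, and the peer named most often becomes the database's suspect node. A self-contained sketch of the same aggregation idea, with simplified types (plain uint32_t node ids and a flat list of reports instead of the merged whiteboard state; names here are illustrative, not the YDB API):

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <unordered_map>
#include <vector>

using TNodeId = uint32_t;

struct TSkewReport {
    TNodeId Reporter = 0; // node that measured the skew
    TNodeId Peer = 0;     // peer with the largest observed skew
    int64_t SkewUs = 0;   // signed skew in microseconds
};

struct TDatabaseSkew {
    uint64_t MaxSkewUs = 0;  // worst absolute skew seen in any report
    TNodeId WorstNodeId = 0; // peer blamed by the most reporters (0 = no majority)
    int64_t WorstAvgUs = 0;  // average skew reported against that peer
};

TDatabaseSkew Aggregate(const std::vector<TSkewReport>& reports) {
    struct TPerPeer { int Reporters = 0; int64_t SumUs = 0; };
    std::unordered_map<TNodeId, TPerPeer> perPeer;
    TDatabaseSkew result;
    for (const auto& r : reports) {
        auto& s = perPeer[r.Peer];
        ++s.Reporters;
        s.SumUs += r.SkewUs;
        result.MaxSkewUs = std::max<uint64_t>(result.MaxSkewUs, std::llabs(r.SkewUs));
    }
    // pick the peer named by the most reporters
    const std::pair<const TNodeId, TPerPeer>* best = nullptr;
    for (const auto& entry : perPeer) {
        if (!best || entry.second.Reporters > best->second.Reporters) {
            best = &entry;
        }
    }
    // accept it only if its reporter count covers at least half of the distinct
    // blamed peers (mirroring the "* 2 >=" check in the diff)
    if (best && best->second.Reporters * 2 >= static_cast<int>(perPeer.size())) {
        result.WorstNodeId = best->first;
        result.WorstAvgUs = best->second.SumUs / best->second.Reporters;
    }
    return result;
}

int main() {
    // nodes 1..3 each blame node 4 for roughly 40 ms of skew
    std::vector<TSkewReport> reports = {{1, 4, 40000}, {2, 4, 42000}, {3, 4, 38000}};
    TDatabaseSkew skew = Aggregate(reports);
    std::cout << "worst node: " << skew.WorstNodeId
              << ", avg skew: " << skew.WorstAvgUs / 1000 << " ms"
              << ", max skew: " << skew.MaxSkewUs / 1000 << " ms\n";
    // prints: worst node: 4, avg skew: 40 ms, max skew: 42 ms
}

Because each node votes for only one peer, the node collecting the most votes is the likely offender, and requiring its vote count to cover at least half of the distinct blamed peers keeps a single noisy measurement from being reported database-wide.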
@@ -2189,67 +2253,50 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         }
         std::sort(computeNodeIds->begin(), computeNodeIds->end());
         computeNodeIds->erase(std::unique(computeNodeIds->begin(), computeNodeIds->end()), computeNodeIds->end());
-        bool bridgeMode = NodeWardenStorageConfig && NodeWardenStorageConfig->Get()->BridgeInfo;
+        bool bridgeMode = NodeWardenStorageConfig && NodeWardenStorageConfig->IsOk() && NodeWardenStorageConfig->Get()->BridgeInfo
+            && !NodeWardenStorageConfig->Get()->BridgeInfo->Piles.empty();
         if (computeNodeIds->empty()) {
             context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "There are no compute nodes", ETags::ComputeState);
         } else {
-            std::vector<TString> activePiles = {""};
-            std::unordered_map<TString, TSelfCheckContext> pileContext;
             if (bridgeMode) {
-                for (size_t i = 0; i < AppData()->BridgeConfig.PilesSize(); ++i) {
-                    const auto& pile = NodeWardenStorageConfig->Get()->BridgeInfo->Piles[i];
-                    const auto& pileName = AppData()->BridgeConfig.GetPiles(i).GetName();
-                    auto [it, _] = pileContext.try_emplace(pileName, &context, "BRIDGE_PILE");
-                    it->second.Location.mutable_compute()->mutable_pile()->set_name(pileName);
-                    if (pile.IsPrimary || pile.IsBeingPromoted) {
-                        activePiles.push_back(pileName);
+                const auto& piles(NodeWardenStorageConfig->Get()->BridgeInfo->Piles);
+                TVector<TNodeId> activePileNodeIds;
+                for (size_t pileNum = 0; pileNum < piles.size(); ++pileNum) {
+                    if (piles[pileNum].IsPrimary || piles[pileNum].IsBeingPromoted) {
+                        if (PileNumToNodeIds.size() <= pileNum) {
+                            continue;
+                        }
+                        const auto& pileNodes(PileNumToNodeIds[pileNum]);
+                        for (TNodeId nodeId : *computeNodeIds) {
+                            if (pileNodes.count(nodeId) > 0) {
+                                activePileNodeIds.push_back(nodeId);
+                            }
+                        }
                     }
                 }
-            }
-            auto getContext = [&](const TString& pileName) -> TSelfCheckContext* {
-                auto it = pileContext.find(pileName);
-                if (it == pileContext.end()) {
-                    return &context;
-                } else {
-                    return &it->second;
-                }
-            };
-            long maxTimeDifferenceUs = 0;
-            for (TNodeId nodeId : *computeNodeIds) {
-                auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
-                if (itNodeSystemState != MergedNodeSystemState.end()) {
-                    const TString& pileName = itNodeSystemState->second->GetLocation().GetBridgePileName();
-                    if (std::count(activePiles.begin(), activePiles.end(), pileName) > 0
-                        && abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()) > maxTimeDifferenceUs) {
-                        maxTimeDifferenceUs = abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs());
-                        databaseState.MaxTimeDifferenceNodeId = nodeId;
-                    }
+                CalculateClockSkewState(databaseState, activePileNodeIds);
+                for (size_t pileNum = 0; pileNum < piles.size(); ++pileNum) {
+                    FillComputePileStatus(databaseState, pileNum, piles[pileNum].Name, *computeNodeIds, computeStatus, {&context, "BRIDGE_PILE"});
                 }
-            }
-            for (TNodeId nodeId : *computeNodeIds) {
-                auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
-                TString pileName;
-                if (itNodeSystemState != MergedNodeSystemState.end()) {
-                    pileName = itNodeSystemState->second->GetLocation().GetBridgePileName();
+                context.ReportWithMaxChildStatus("There are compute issues", ETags::ComputeState, {ETags::PileComputeState});
+            } else {
+                CalculateClockSkewState(databaseState, *computeNodeIds);
+                for (TNodeId nodeId : *computeNodeIds) {
+                    auto& computeNode = *computeStatus.add_nodes();
+                    FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"});
                 }
-                auto& computeNode = *computeStatus.add_nodes();
-                FillComputeNodeStatus(databaseState, nodeId, computeNode, {getContext(pileName), "COMPUTE_NODE"});
-            }
-            for (auto& [_, ctx] : pileContext) {
-                report(ctx, ETags::PileComputeState);
+                context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::Uptime});
+                context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState});
+                context.ReportWithMaxChildStatus("Compute quota usage", ETags::ComputeState, {ETags::QuotaUsage});
+                context.ReportWithMaxChildStatus("Clock skew issues", ETags::ComputeState, {ETags::NodeClockSkew});
             }
         }
         Ydb::Monitoring::StatusFlag::Status systemStatus = FillSystemTablets(databaseState, {&context, "SYSTEM_TABLET"});
         if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) {
             context.ReportStatus(systemStatus, "Compute has issues with system tablets", ETags::ComputeState, {ETags::SystemTabletState});
         }
-        FillComputeDatabaseStatus(databaseState, computeStatus, {&context, "COMPUTE_QUOTA"});
-        if (bridgeMode) {
-            context.ReportWithMaxChildStatus("There are compute issues", ETags::ComputeState, {ETags::PileComputeState});
-        } else {
-            report(context, ETags::ComputeState);
-        }
-        context.ReportWithMaxChildStatus("Database has time difference between nodes", ETags::ComputeState, {ETags::SyncState});
+        FillComputeDatabaseQuota(databaseState, computeStatus, {&context, "COMPUTE_QUOTA"});
+        FillComputeDatabaseClockSkew(databaseState, computeStatus, {&context, "CLOCK_SKEW"});
         Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
         computeNodeIds->push_back(0); // for tablets without node
         for (TNodeId nodeId : *computeNodeIds) {
