@@ -149,7 +149,8 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         TabletState,
         SystemTabletState,
         OverloadState,
-        SyncState,
+        NodeClockSkew,
+        DatabaseClockSkew,
         Uptime,
         QuotaUsage,
         BridgeGroupState,
@@ -268,9 +269,11 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         ui64 StorageQuota = 0;
         ui64 StorageUsage = 0;
         TMaybe<NKikimrSubDomains::EServerlessComputeResourcesMode> ServerlessComputeResourcesMode;
-        TNodeId MaxTimeDifferenceNodeId = 0;
         TString Path;
         THashMap<TString, TVector<TNodeId>> PileNodeIds;
+        ui64 MaxClockSkewUs = 0; // maximum clock skew between database nodes
+        TNodeId MaxClockSkewNodeId = 0; // node id reported by most of the other nodes
+        i64 MaxClockSkewNodeAvgUs = 0; // average clock skew reported by most of the other nodes
     };
 
     struct TGroupState {
@@ -665,6 +668,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
     std::optional<TRequestResponse<TEvConsole::TEvListTenantsResponse>> ListTenants;
     std::optional<TRequestResponse<TEvInterconnect::TEvNodesInfo>> NodesInfo;
     THashMap<TNodeId, const TEvInterconnect::TNodeInfo*> MergedNodeInfo;
+    std::vector<std::unordered_set<TNodeId>> PileNumToNodeIds;
    std::optional<TRequestResponse<TEvSysView::TEvGetStoragePoolsResponse>> StoragePools;
    std::optional<TRequestResponse<TEvSysView::TEvGetGroupsResponse>> Groups;
    std::optional<TRequestResponse<TEvSysView::TEvGetVSlotsResponse>> VSlots;
@@ -1469,6 +1473,14 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
                 RequestComputeNode(ni.NodeId);
             }
         }
+        if (NodesInfo->Get()->PileMap) {
+            for (size_t pileNum = 0; pileNum < NodesInfo->Get()->PileMap->size(); ++pileNum) {
+                PileNumToNodeIds.emplace_back();
+                for (TNodeId nodeId : (*NodesInfo->Get()->PileMap)[pileNum]) {
+                    PileNumToNodeIds[pileNum].insert(nodeId);
+                }
+            }
+        }
         RequestDone("TEvNodesInfo");
     }
 
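Note on the hunk above: it inverts the interconnect pile map (pile number to list of node IDs) into `PileNumToNodeIds`, a vector of hash sets, so later per-pile passes can test membership in O(1) instead of scanning lists. A minimal standalone sketch of the same inversion; the plain `std::vector` input and the `TNodeId` alias are stand-ins, not the actual `TEvNodesInfo::PileMap` types:

```cpp
#include <cstdint>
#include <unordered_set>
#include <vector>

using TNodeId = uint32_t; // stand-in for the actor-system node id type

// Invert "pile number -> node ids" into per-pile hash sets for O(1)
// membership checks, mirroring how PileNumToNodeIds is filled above.
std::vector<std::unordered_set<TNodeId>> BuildPileSets(
        const std::vector<std::vector<TNodeId>>& pileMap) {
    std::vector<std::unordered_set<TNodeId>> pileSets;
    pileSets.reserve(pileMap.size());
    for (const auto& nodes : pileMap) {
        pileSets.emplace_back(nodes.begin(), nodes.end());
    }
    return pileSets;
}
```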
@@ -2103,30 +2115,19 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
                 loadAverageStatus.set_overall(laContext.GetOverallStatus());
             }
 
-            if (nodeSystemState.HasMaxClockSkewPeerId()) {
-                TNodeId peerId = nodeSystemState.GetMaxClockSkewPeerId();
-                long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs();
-                TDuration timeDifferenceDuration = TDuration::MicroSeconds(abs(timeDifferenceUs));
-                Ydb::Monitoring::StatusFlag::Status status;
-                if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceOrange())) {
-                    status = Ydb::Monitoring::StatusFlag::ORANGE;
-                } else if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceYellow())) {
-                    status = Ydb::Monitoring::StatusFlag::YELLOW;
+            if (databaseState.MaxClockSkewNodeId == nodeId) {
+                TSelfCheckContext tdContext(&context, "CLOCK_SKEW");
+                Ydb::Monitoring::ClockSkewStatus& clockSkewStatus = *computeNodeStatus.mutable_clock_skew();
+                ui64 clockSkew = abs(databaseState.MaxClockSkewNodeAvgUs);
+                clockSkewStatus.set_clock_skew(-databaseState.MaxClockSkewNodeAvgUs / 1000); // in ms
+                if (clockSkew >= HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceOrange()) {
+                    tdContext.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Clock skew exceeds threshold", ETags::NodeClockSkew);
+                } else if (clockSkew >= HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceYellow()) {
+                    tdContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Clock skew above recommended limit", ETags::NodeClockSkew);
                 } else {
-                    status = Ydb::Monitoring::StatusFlag::GREEN;
-                }
-
-                if (databaseState.MaxTimeDifferenceNodeId == nodeId) {
-                    TSelfCheckContext tdContext(&context, "NODES_TIME_DIFFERENCE");
-                    if (status == Ydb::Monitoring::StatusFlag::GREEN) {
-                        tdContext.ReportStatus(status);
-                    } else {
-                        tdContext.ReportStatus(status, TStringBuilder() << "Node is "
-                            << timeDifferenceDuration.MilliSeconds() << " ms "
-                            << (timeDifferenceUs > 0 ? "behind " : "ahead of ")
-                            << "peer [" << peerId << "]", ETags::SyncState);
-                    }
+                    tdContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
                 }
+                clockSkewStatus.set_overall(tdContext.GetOverallStatus());
             }
         } else {
             // context.ReportStatus(Ydb::Monitoring::StatusFlag::RED,
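Reading the replacement hunk above: the per-node `CLOCK_SKEW` issue is no longer derived from each node's own worst-peer measurement; it is attached only to the single node that the database-level aggregation identified (`MaxClockSkewNodeId`). The minus sign on `set_clock_skew` appears to be there because `MaxClockSkewNodeAvgUs` stores the average skew as measured *by* the reporting peers, so the outlier node's own offset has the opposite sign; that is a reading of the code, not something the patch states. The two-threshold ladder itself is simple; a compilable sketch with illustrative threshold values (the real ones come from `HealthCheckConfig.GetThresholds()`):

```cpp
#include <cstdint>
#include <cstdlib>

enum class ESkewStatus { Green, Yellow, Orange };

// Two-threshold classification as in the hunk above. Thresholds are in
// microseconds; 5000/25000 are illustrative, not the config defaults.
ESkewStatus ClassifyClockSkew(int64_t avgSkewUs,
                              uint64_t yellowUs = 5000,
                              uint64_t orangeUs = 25000) {
    const uint64_t skew = std::abs(avgSkewUs);
    if (skew >= orangeUs) {
        return ESkewStatus::Orange; // "Clock skew exceeds threshold"
    }
    if (skew >= yellowUs) {
        return ESkewStatus::Yellow; // "Clock skew above recommended limit"
    }
    return ESkewStatus::Green;
}
```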
@@ -2137,7 +2138,29 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         computeNodeStatus.set_overall(context.GetOverallStatus());
     }
 
-    void FillComputeDatabaseStatus(TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
+    void FillComputePileStatus(TDatabaseState& databaseState, size_t pileNum, const TString& pileName,
+            const TVector<TNodeId>& computeNodeIds, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
+        // temporary solution - it's better to rework the concept of "piles" in health check
+        if (PileNumToNodeIds.size() <= pileNum) {
+            return;
+        }
+        context.Location.mutable_compute()->mutable_pile()->set_name(pileName);
+        const auto& pileNodes(PileNumToNodeIds[pileNum]);
+        for (TNodeId nodeId : computeNodeIds) {
+            if (pileNodes.count(nodeId) == 0) {
+                continue;
+            }
+            auto& computeNode = *computeStatus.add_nodes();
+            FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"});
+
+        }
+        context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::PileComputeState, {ETags::Uptime});
+        context.ReportWithMaxChildStatus("Compute is overloaded", ETags::PileComputeState, {ETags::OverloadState});
+        context.ReportWithMaxChildStatus("Compute quota usage", ETags::PileComputeState, {ETags::QuotaUsage});
+        context.ReportWithMaxChildStatus("Clock skew issues", ETags::PileComputeState, {ETags::NodeClockSkew});
+    }
+
+    void FillComputeDatabaseQuota(TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
         auto itDescribe = DescribeByPath.find(databaseState.Path);
         if (itDescribe != DescribeByPath.end() && itDescribe->second.IsOk()) {
             const auto& domain(itDescribe->second->GetRecord().GetPathDescription().GetDomainDescription());
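The new `FillComputePileStatus` scopes node issues to a `BRIDGE_PILE` context and then rolls child issues up by tag. `ReportWithMaxChildStatus` is pre-existing health-check machinery; assuming it behaves as its name suggests (emit a summary issue at the worst severity found among child issues carrying the given tags), the pattern reduces to this sketch, where every type is a stand-in for the actual `TSelfCheckContext` machinery:

```cpp
#include <algorithm>
#include <string>
#include <vector>

// Stand-in severity scale, ordered so that "worse" compares greater.
enum class ESeverity { Green = 0, Blue, Yellow, Orange, Red };

struct TChildIssue {
    std::string Tag;    // e.g. "Uptime", "OverloadState"
    ESeverity Severity;
};

// Roll-up: worst severity among child issues carrying the given tag.
// Mirrors the assumed behaviour of ReportWithMaxChildStatus.
ESeverity WorstChildStatus(const std::vector<TChildIssue>& children,
                           const std::string& tag) {
    ESeverity worst = ESeverity::Green;
    for (const auto& child : children) {
        if (child.Tag == tag) {
            worst = std::max(worst, child.Severity);
        }
    }
    return worst;
}
```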
@@ -2170,13 +2193,54 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         }
     }
 
+    void FillComputeDatabaseClockSkew(TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
+        ui64 clockSkew = databaseState.MaxClockSkewUs;
+        if (clockSkew >= HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceOrange()) {
+            context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Clock skew exceeds threshold", ETags::DatabaseClockSkew, {ETags::NodeClockSkew});
+        } else if (clockSkew >= HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceYellow()) {
+            context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Clock skew above recommended limit", ETags::DatabaseClockSkew, {ETags::NodeClockSkew});
+        } else {
+            context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
+        }
+        computeStatus.mutable_clock_skew()->set_clock_skew(clockSkew / 1000); // in ms
+        computeStatus.mutable_clock_skew()->set_overall(context.GetOverallStatus());
+    }
+
+    struct TNodeClockSkewState {
+        TNodeId NodeId = 0;
+        int NumberOfReporters = 0;
+        i64 SumClockSkewUs = 0;
+    };
+
+    void CalculateClockSkewState(TDatabaseState& databaseState, const TVector<TNodeId>& computeNodes) {
+        std::unordered_map<TNodeId, TNodeClockSkewState> nodeClockSkewState;
+        for (TNodeId nodeId : computeNodes) {
+            auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
+            if (itNodeSystemState != MergedNodeSystemState.end()) {
+                const NKikimrWhiteboard::TSystemStateInfo& nodeSystemState(*itNodeSystemState->second);
+                if (nodeSystemState.HasMaxClockSkewPeerId()) {
+                    TNodeId peerId = nodeSystemState.GetMaxClockSkewPeerId();
+                    i64 timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs();
+                    auto& clockSkewState = nodeClockSkewState[peerId];
+                    clockSkewState.NumberOfReporters++;
+                    clockSkewState.SumClockSkewUs += timeDifferenceUs;
+                    databaseState.MaxClockSkewUs = std::max<ui64>(databaseState.MaxClockSkewUs, abs(timeDifferenceUs));
+                }
+            }
+        }
+        auto itMaxClockSkew = std::ranges::max_element(nodeClockSkewState, [](const auto& a, const auto& b) {
+            return a.second.NumberOfReporters < b.second.NumberOfReporters;
+        });
+        if (itMaxClockSkew != nodeClockSkewState.end()) {
+            if (itMaxClockSkew->second.NumberOfReporters * 2 >= static_cast<int>(nodeClockSkewState.size())) { // at least 50% of reporters
+                databaseState.MaxClockSkewNodeId = itMaxClockSkew->first;
+                databaseState.MaxClockSkewNodeAvgUs = itMaxClockSkew->second.SumClockSkewUs / itMaxClockSkew->second.NumberOfReporters;
+            }
+        }
+    }
+
     void FillCompute(TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
         TVector<TNodeId>* computeNodeIds = &databaseState.ComputeNodeIds;
-        auto report = [](TSelfCheckContext& context, ETags tag) {
-            context.ReportWithMaxChildStatus("Some nodes are restarting too often", tag, {ETags::Uptime});
-            context.ReportWithMaxChildStatus("Compute is overloaded", tag, {ETags::OverloadState});
-            context.ReportWithMaxChildStatus("Compute quota usage", tag, {ETags::QuotaUsage});
-        };
         if (databaseState.ResourcePathId
             && databaseState.ServerlessComputeResourcesMode != NKikimrSubDomains::EServerlessComputeResourcesModeExclusive)
         {
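`CalculateClockSkewState` implements a simple vote: every node names the peer it measures the largest skew against, votes are tallied per accused peer, and the most-accused peer becomes the database's skew outlier if it was named by at least half as many reporters as there are tallied peers (the `* 2 >=` guard). One detail worth flagging: `std::ranges::max_element` interprets its comparator as less-than, so selecting the most-accused peer requires `<` on the reporter count; a `>` there would silently pick the least-accused one. A standalone sketch of the tally with plain STL types:

```cpp
#include <algorithm>
#include <cstdint>
#include <unordered_map>

struct TVote {
    int Reporters = 0;     // how many nodes accused this peer
    int64_t SumSkewUs = 0; // sum of their measured skews
};

struct TOutlier {
    uint32_t NodeId = 0;
    int64_t AvgSkewUs = 0;
    bool Found = false;
};

// Pick the peer accused by the most reporters; accept it only if it
// holds at least half of the tallied entries, as in the hunk above.
TOutlier FindOutlier(const std::unordered_map<uint32_t, TVote>& votes) {
    auto it = std::max_element(votes.begin(), votes.end(),
        [](const auto& a, const auto& b) {
            return a.second.Reporters < b.second.Reporters; // less-than!
        });
    TOutlier result;
    if (it != votes.end() && it->second.Reporters * 2 >= static_cast<int>(votes.size())) {
        result = {it->first, it->second.SumSkewUs / it->second.Reporters, true};
    }
    return result;
}
```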
@@ -2189,67 +2253,50 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         }
         std::sort(computeNodeIds->begin(), computeNodeIds->end());
         computeNodeIds->erase(std::unique(computeNodeIds->begin(), computeNodeIds->end()), computeNodeIds->end());
-        bool bridgeMode = NodeWardenStorageConfig && NodeWardenStorageConfig->Get()->BridgeInfo;
+        bool bridgeMode = NodeWardenStorageConfig && NodeWardenStorageConfig->IsOk() && NodeWardenStorageConfig->Get()->BridgeInfo
+            && !NodeWardenStorageConfig->Get()->BridgeInfo->Piles.empty();
         if (computeNodeIds->empty()) {
             context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "There are no compute nodes", ETags::ComputeState);
         } else {
-            std::vector<TString> activePiles = {""};
-            std::unordered_map<TString, TSelfCheckContext> pileContext;
             if (bridgeMode) {
-                for (size_t i = 0; i < AppData()->BridgeConfig.PilesSize(); ++i) {
-                    const auto& pile = NodeWardenStorageConfig->Get()->BridgeInfo->Piles[i];
-                    const auto& pileName = AppData()->BridgeConfig.GetPiles(i).GetName();
-                    auto [it, _] = pileContext.try_emplace(pileName, &context, "BRIDGE_PILE");
-                    it->second.Location.mutable_compute()->mutable_pile()->set_name(pileName);
-                    if (pile.IsPrimary || pile.IsBeingPromoted) {
-                        activePiles.push_back(pileName);
+                const auto& piles(NodeWardenStorageConfig->Get()->BridgeInfo->Piles);
+                TVector<TNodeId> activePileNodeIds;
+                for (size_t pileNum = 0; pileNum < piles.size(); ++pileNum) {
+                    if (piles[pileNum].IsPrimary || piles[pileNum].IsBeingPromoted) {
+                        if (PileNumToNodeIds.size() <= pileNum) {
+                            continue;
+                        }
+                        const auto& pileNodes(PileNumToNodeIds[pileNum]);
+                        for (TNodeId nodeId : *computeNodeIds) {
+                            if (pileNodes.count(nodeId) > 0) {
+                                activePileNodeIds.push_back(nodeId);
+                            }
+                        }
                     }
                 }
-            }
-            auto getContext = [&](const TString& pileName) -> TSelfCheckContext* {
-                auto it = pileContext.find(pileName);
-                if (it == pileContext.end()) {
-                    return &context;
-                } else {
-                    return &it->second;
-                }
-            };
-            long maxTimeDifferenceUs = 0;
-            for (TNodeId nodeId : *computeNodeIds) {
-                auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
-                if (itNodeSystemState != MergedNodeSystemState.end()) {
-                    const TString& pileName = itNodeSystemState->second->GetLocation().GetBridgePileName();
-                    if (std::count(activePiles.begin(), activePiles.end(), pileName) > 0
-                        && abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()) > maxTimeDifferenceUs) {
-                        maxTimeDifferenceUs = abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs());
-                        databaseState.MaxTimeDifferenceNodeId = nodeId;
-                    }
+                CalculateClockSkewState(databaseState, activePileNodeIds);
+                for (size_t pileNum = 0; pileNum < piles.size(); ++pileNum) {
+                    FillComputePileStatus(databaseState, pileNum, piles[pileNum].Name, *computeNodeIds, computeStatus, {&context, "BRIDGE_PILE"});
                 }
-            }
-            for (TNodeId nodeId : *computeNodeIds) {
-                auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
-                TString pileName;
-                if (itNodeSystemState != MergedNodeSystemState.end()) {
-                    pileName = itNodeSystemState->second->GetLocation().GetBridgePileName();
+                context.ReportWithMaxChildStatus("There are compute issues", ETags::ComputeState, {ETags::PileComputeState});
+            } else {
+                CalculateClockSkewState(databaseState, *computeNodeIds);
+                for (TNodeId nodeId : *computeNodeIds) {
+                    auto& computeNode = *computeStatus.add_nodes();
+                    FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"});
                 }
-                auto& computeNode = *computeStatus.add_nodes();
-                FillComputeNodeStatus(databaseState, nodeId, computeNode, {getContext(pileName), "COMPUTE_NODE"});
-            }
-            for (auto& [_, ctx] : pileContext) {
-                report(ctx, ETags::PileComputeState);
+                context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::Uptime});
+                context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState});
+                context.ReportWithMaxChildStatus("Compute quota usage", ETags::ComputeState, {ETags::QuotaUsage});
+                context.ReportWithMaxChildStatus("Clock skew issues", ETags::ComputeState, {ETags::NodeClockSkew});
             }
         }
         Ydb::Monitoring::StatusFlag::Status systemStatus = FillSystemTablets(databaseState, {&context, "SYSTEM_TABLET"});
         if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) {
             context.ReportStatus(systemStatus, "Compute has issues with system tablets", ETags::ComputeState, {ETags::SystemTabletState});
         }
-        FillComputeDatabaseStatus(databaseState, computeStatus, {&context, "COMPUTE_QUOTA"});
-        if (bridgeMode) {
-            context.ReportWithMaxChildStatus("There are compute issues", ETags::ComputeState, {ETags::PileComputeState});
-        } else {
-            report(context, ETags::ComputeState);
-        }
-        context.ReportWithMaxChildStatus("Database has time difference between nodes", ETags::ComputeState, {ETags::SyncState});
+        FillComputeDatabaseQuota(databaseState, computeStatus, {&context, "COMPUTE_QUOTA"});
+        FillComputeDatabaseClockSkew(databaseState, computeStatus, {&context, "CLOCK_SKEW"});
         Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
         computeNodeIds->push_back(0); // for tablets without node
         for (TNodeId nodeId : *computeNodeIds) {
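In bridge mode the skew statistics run only over nodes from active piles (primary or being promoted): a passive pile can legitimately drift while it is out of rotation, so including it would raise false alarms (that rationale is inferred from the code, not stated in the patch). The selection itself is a filter of the compute node list through the per-pile sets; a compilable sketch with stand-in types:

```cpp
#include <cstdint>
#include <unordered_set>
#include <vector>

using TNodeId = uint32_t; // stand-in

struct TPile {
    bool IsPrimary = false;
    bool IsBeingPromoted = false;
};

// Keep only compute nodes that live in an active (primary or
// being-promoted) pile, mirroring the bridgeMode branch above.
std::vector<TNodeId> SelectActivePileNodes(
        const std::vector<TPile>& piles,
        const std::vector<std::unordered_set<TNodeId>>& pileSets,
        const std::vector<TNodeId>& computeNodes) {
    std::vector<TNodeId> active;
    for (size_t pileNum = 0; pileNum < piles.size() && pileNum < pileSets.size(); ++pileNum) {
        if (!piles[pileNum].IsPrimary && !piles[pileNum].IsBeingPromoted) {
            continue;
        }
        for (TNodeId nodeId : computeNodes) {
            if (pileSets[pileNum].count(nodeId) > 0) {
                active.push_back(nodeId);
            }
        }
    }
    return active;
}
```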