Commit 18b50f8

improve clock skew calculations and reporting (#24130)
1 parent 4d56ef2 commit 18b50f8

File tree

3 files changed: +159 −86 lines changed

ydb/core/health_check/health_check.cpp

Lines changed: 124 additions & 77 deletions
@@ -149,7 +149,8 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         TabletState,
         SystemTabletState,
         OverloadState,
-        SyncState,
+        NodeClockSkew,
+        DatabaseClockSkew,
         Uptime,
         QuotaUsage,
         BridgeGroupState,
@@ -268,9 +269,11 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         ui64 StorageQuota = 0;
         ui64 StorageUsage = 0;
         TMaybeServerlessComputeResourcesMode ServerlessComputeResourcesMode;
-        TNodeId MaxTimeDifferenceNodeId = 0;
         TString Path;
         THashMap<TString, TVector<TNodeId>> PileNodeIds;
+        ui64 MaxClockSkewUs = 0; // maximum clock skew between database nodes
+        TNodeId MaxClockSkewNodeId = 0; // node id reported by most of the other nodes
+        i64 MaxClockSkewNodeAvgUs = 0; // average clock skew reported by most of the other nodes
     };

     struct TGroupState {
@@ -665,6 +668,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
     std::optional<TRequestResponse<TEvConsole::TEvListTenantsResponse>> ListTenants;
     std::optional<TRequestResponse<TEvInterconnect::TEvNodesInfo>> NodesInfo;
     THashMap<TNodeId, const TEvInterconnect::TNodeInfo*> MergedNodeInfo;
+    std::vector<std::unordered_set<TNodeId>> PileNumToNodeIds;
     std::optional<TRequestResponse<TEvSysView::TEvGetStoragePoolsResponse>> StoragePools;
     std::optional<TRequestResponse<TEvSysView::TEvGetGroupsResponse>> Groups;
     std::optional<TRequestResponse<TEvSysView::TEvGetVSlotsResponse>> VSlots;
@@ -1469,6 +1473,14 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
                 RequestComputeNode(ni.NodeId);
             }
         }
+        if (NodesInfo->Get()->PileMap) {
+            for (size_t pileNum = 0; pileNum < NodesInfo->Get()->PileMap->size(); ++pileNum) {
+                PileNumToNodeIds.emplace_back();
+                for (TNodeId nodeId : (*NodesInfo->Get()->PileMap)[pileNum]) {
+                    PileNumToNodeIds[pileNum].insert(nodeId);
+                }
+            }
+        }
         RequestDone("TEvNodesInfo");
     }

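For reference, a minimal standalone sketch of the lookup structure built in the hunk above, assuming the pile map is simply a vector of node-id lists indexed by pile number (the names and simplified types here are illustrative, not the YDB API):

#include <cstdint>
#include <unordered_set>
#include <vector>

using TNodeId = uint32_t; // simplified stand-in for the actual node id type

// Flatten a pile map (one node-id list per pile number) into per-pile hash sets,
// so that "does node N belong to pile P?" becomes an O(1) lookup.
std::vector<std::unordered_set<TNodeId>> BuildPileLookup(const std::vector<std::vector<TNodeId>>& pileMap) {
    std::vector<std::unordered_set<TNodeId>> pileNumToNodeIds;
    pileNumToNodeIds.reserve(pileMap.size());
    for (const auto& pileNodes : pileMap) {
        pileNumToNodeIds.emplace_back(pileNodes.begin(), pileNodes.end());
    }
    return pileNumToNodeIds;
}

int main() {
    // two piles: pile 0 = nodes {1, 2}, pile 1 = nodes {3}
    auto lookup = BuildPileLookup({{1, 2}, {3}});
    return lookup[1].count(3) == 1 ? 0 : 1; // node 3 belongs to pile 1
}

FillComputePileStatus below filters a database's compute nodes through exactly this kind of per-pile set before reporting pile-level issues.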
@@ -2103,30 +2115,19 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
                 loadAverageStatus.set_overall(laContext.GetOverallStatus());
             }

-            if (nodeSystemState.HasMaxClockSkewPeerId()) {
-                TNodeId peerId = nodeSystemState.GetMaxClockSkewPeerId();
-                long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs();
-                TDuration timeDifferenceDuration = TDuration::MicroSeconds(abs(timeDifferenceUs));
-                Ydb::Monitoring::StatusFlag::Status status;
-                if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceOrange())) {
-                    status = Ydb::Monitoring::StatusFlag::ORANGE;
-                } else if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceYellow())) {
-                    status = Ydb::Monitoring::StatusFlag::YELLOW;
+            if (databaseState.MaxClockSkewNodeId == nodeId) {
+                TSelfCheckContext tdContext(&context, "CLOCK_SKEW");
+                Ydb::Monitoring::ClockSkewStatus& clockSkewStatus = *computeNodeStatus.mutable_clock_skew();
+                ui64 clockSkew = abs(databaseState.MaxClockSkewNodeAvgUs);
+                clockSkewStatus.set_clock_skew(-databaseState.MaxClockSkewNodeAvgUs / 1000); // in ms
+                if (clockSkew >= HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceOrange()) {
+                    tdContext.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Clock skew exceeds threshold", ETags::NodeClockSkew);
+                } else if (clockSkew >= HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceYellow()) {
+                    tdContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Clock skew above recommended limit", ETags::NodeClockSkew);
                 } else {
-                    status = Ydb::Monitoring::StatusFlag::GREEN;
-                }
-
-                if (databaseState.MaxTimeDifferenceNodeId == nodeId) {
-                    TSelfCheckContext tdContext(&context, "NODES_TIME_DIFFERENCE");
-                    if (status == Ydb::Monitoring::StatusFlag::GREEN) {
-                        tdContext.ReportStatus(status);
-                    } else {
-                        tdContext.ReportStatus(status, TStringBuilder() << "Node is "
-                            << timeDifferenceDuration.MilliSeconds() << " ms "
-                            << (timeDifferenceUs > 0 ? "behind " : "ahead of ")
-                            << "peer [" << peerId << "]", ETags::SyncState);
-                    }
+                    tdContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
                 }
+                clockSkewStatus.set_overall(tdContext.GetOverallStatus());
             }
         } else {
             // context.ReportStatus(Ydb::Monitoring::StatusFlag::RED,
@@ -2137,7 +2138,29 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         computeNodeStatus.set_overall(context.GetOverallStatus());
     }

-    void FillComputeDatabaseStatus(TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
+    void FillComputePileStatus(TDatabaseState& databaseState, size_t pileNum, const TString& pileName,
+            const TVector<TNodeId>& computeNodeIds, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
+        // temporary solution - it's better to rework the concept of "piles" in health check
+        if (PileNumToNodeIds.size() <= pileNum) {
+            return;
+        }
+        context.Location.mutable_compute()->mutable_pile()->set_name(pileName);
+        const auto& pileNodes(PileNumToNodeIds[pileNum]);
+        for (TNodeId nodeId : computeNodeIds) {
+            if (pileNodes.count(nodeId) == 0) {
+                continue;
+            }
+            auto& computeNode = *computeStatus.add_nodes();
+            FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"});
+
+        }
+        context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::PileComputeState, {ETags::Uptime});
+        context.ReportWithMaxChildStatus("Compute is overloaded", ETags::PileComputeState, {ETags::OverloadState});
+        context.ReportWithMaxChildStatus("Compute quota usage", ETags::PileComputeState, {ETags::QuotaUsage});
+        context.ReportWithMaxChildStatus("Clock skew issues", ETags::PileComputeState, {ETags::NodeClockSkew});
+    }
+
+    void FillComputeDatabaseQuota(TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
         auto itDescribe = DescribeByPath.find(databaseState.Path);
         if (itDescribe != DescribeByPath.end() && itDescribe->second.IsOk()) {
             const auto& domain(itDescribe->second->GetRecord().GetPathDescription().GetDomainDescription());
@@ -2170,13 +2193,54 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         }
     }

+    void FillComputeDatabaseClockSkew(TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
+        ui64 clockSkew = databaseState.MaxClockSkewUs;
+        if (clockSkew >= HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceOrange()) {
+            context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Clock skew exceeds threshold", ETags::DatabaseClockSkew, {ETags::NodeClockSkew});
+        } else if (clockSkew >= HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceYellow()) {
+            context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Clock skew above recommended limit", ETags::DatabaseClockSkew, {ETags::NodeClockSkew});
+        } else {
+            context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
+        }
+        computeStatus.mutable_clock_skew()->set_clock_skew(clockSkew / 1000); // in ms
+        computeStatus.mutable_clock_skew()->set_overall(context.GetOverallStatus());
+    }
+
+    struct TNodeClockSkewState {
+        TNodeId NodeId = 0;
+        int NumberOfReporters = 0;
+        i64 SumClockSkewUs = 0;
+    };
+
+    void CalculateClockSkewState(TDatabaseState& databaseState, const TVector<TNodeId>& computeNodes) {
+        std::unordered_map<TNodeId, TNodeClockSkewState> nodeClockSkewState;
+        for (TNodeId nodeId : computeNodes) {
+            auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
+            if (itNodeSystemState != MergedNodeSystemState.end()) {
+                const NKikimrWhiteboard::TSystemStateInfo& nodeSystemState(*itNodeSystemState->second);
+                if (nodeSystemState.HasMaxClockSkewPeerId()) {
+                    TNodeId peerId = nodeSystemState.GetMaxClockSkewPeerId();
+                    i64 timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs();
+                    auto& clockSkewState = nodeClockSkewState[peerId];
+                    clockSkewState.NumberOfReporters++;
+                    clockSkewState.SumClockSkewUs += timeDifferenceUs;
+                    databaseState.MaxClockSkewUs = std::max<ui64>(databaseState.MaxClockSkewUs, abs(timeDifferenceUs));
+                }
+            }
+        }
+        auto itMaxClockSkew = std::ranges::max_element(nodeClockSkewState, [](const auto& a, const auto& b) {
+            return a.second.NumberOfReporters > b.second.NumberOfReporters;
+        });
+        if (itMaxClockSkew != nodeClockSkewState.end()) {
+            if (itMaxClockSkew->second.NumberOfReporters * 2 >= static_cast<int>(nodeClockSkewState.size())) { // at least 50% of reporters
+                databaseState.MaxClockSkewNodeId = itMaxClockSkew->first;
+                databaseState.MaxClockSkewNodeAvgUs = itMaxClockSkew->second.SumClockSkewUs / itMaxClockSkew->second.NumberOfReporters;
+            }
+        }
+    }
+
     void FillCompute(TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
         TVector<TNodeId>* computeNodeIds = &databaseState.ComputeNodeIds;
-        auto report = [](TSelfCheckContext& context, ETags tag) {
-            context.ReportWithMaxChildStatus("Some nodes are restarting too often", tag, {ETags::Uptime});
-            context.ReportWithMaxChildStatus("Compute is overloaded", tag, {ETags::OverloadState});
-            context.ReportWithMaxChildStatus("Compute quota usage", tag, {ETags::QuotaUsage});
-        };
         if (databaseState.ResourcePathId
             && databaseState.ServerlessComputeResourcesMode != NKikimrSubDomains::EServerlessComputeResourcesModeExclusive)
         {
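The hunk above moves skew detection from "largest single measurement" to a majority vote across reporters: each node names the one peer it sees the largest skew against, and the peer named most often becomes the database's suspect node. A self-contained sketch of the same aggregation idea, with simplified types (plain uint32_t node ids and a flat list of reports instead of the merged whiteboard state; names here are illustrative, not the YDB API):

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <unordered_map>
#include <vector>

using TNodeId = uint32_t;

struct TSkewReport {
    TNodeId Reporter = 0; // node that measured the skew
    TNodeId Peer = 0;     // peer with the largest observed skew
    int64_t SkewUs = 0;   // signed skew in microseconds
};

struct TDatabaseSkew {
    uint64_t MaxSkewUs = 0;  // worst absolute skew seen in any report
    TNodeId WorstNodeId = 0; // peer blamed by the most reporters (0 = no majority)
    int64_t WorstAvgUs = 0;  // average skew reported against that peer
};

TDatabaseSkew Aggregate(const std::vector<TSkewReport>& reports) {
    struct TPerPeer { int Reporters = 0; int64_t SumUs = 0; };
    std::unordered_map<TNodeId, TPerPeer> perPeer;
    TDatabaseSkew result;
    for (const auto& r : reports) {
        auto& s = perPeer[r.Peer];
        ++s.Reporters;
        s.SumUs += r.SkewUs;
        result.MaxSkewUs = std::max<uint64_t>(result.MaxSkewUs, std::llabs(r.SkewUs));
    }
    // pick the peer named by the most reporters
    const std::pair<const TNodeId, TPerPeer>* best = nullptr;
    for (const auto& entry : perPeer) {
        if (!best || entry.second.Reporters > best->second.Reporters) {
            best = &entry;
        }
    }
    // accept it only if its reporter count covers at least half of the distinct
    // blamed peers (mirroring the "* 2 >=" check in the diff)
    if (best && best->second.Reporters * 2 >= static_cast<int>(perPeer.size())) {
        result.WorstNodeId = best->first;
        result.WorstAvgUs = best->second.SumUs / best->second.Reporters;
    }
    return result;
}

int main() {
    // nodes 1..3 each blame node 4 for roughly 40 ms of skew
    std::vector<TSkewReport> reports = {{1, 4, 40000}, {2, 4, 42000}, {3, 4, 38000}};
    TDatabaseSkew skew = Aggregate(reports);
    std::cout << "worst node: " << skew.WorstNodeId
              << ", avg skew: " << skew.WorstAvgUs / 1000 << " ms"
              << ", max skew: " << skew.MaxSkewUs / 1000 << " ms\n";
    // prints: worst node: 4, avg skew: 40 ms, max skew: 42 ms
}

Because each node votes for only one peer, the node collecting the most votes is the likely offender, and requiring its vote count to cover at least half of the distinct blamed peers keeps a single noisy measurement from being reported database-wide.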
@@ -2189,67 +2253,50 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         }
         std::sort(computeNodeIds->begin(), computeNodeIds->end());
         computeNodeIds->erase(std::unique(computeNodeIds->begin(), computeNodeIds->end()), computeNodeIds->end());
-        bool bridgeMode = NodeWardenStorageConfig && NodeWardenStorageConfig->Get()->BridgeInfo;
+        bool bridgeMode = NodeWardenStorageConfig && NodeWardenStorageConfig->IsOk() && NodeWardenStorageConfig->Get()->BridgeInfo
+            && !NodeWardenStorageConfig->Get()->BridgeInfo->Piles.empty();
         if (computeNodeIds->empty()) {
             context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "There are no compute nodes", ETags::ComputeState);
         } else {
-            std::vector<TString> activePiles = {""};
-            std::unordered_map<TString, TSelfCheckContext> pileContext;
             if (bridgeMode) {
-                for (size_t i = 0; i < AppData()->BridgeConfig.PilesSize(); ++i) {
-                    const auto& pile = NodeWardenStorageConfig->Get()->BridgeInfo->Piles[i];
-                    const auto& pileName = AppData()->BridgeConfig.GetPiles(i).GetName();
-                    auto [it, _] = pileContext.try_emplace(pileName, &context, "BRIDGE_PILE");
-                    it->second.Location.mutable_compute()->mutable_pile()->set_name(pileName);
-                    if (pile.IsPrimary || pile.IsBeingPromoted) {
-                        activePiles.push_back(pileName);
+                const auto& piles(NodeWardenStorageConfig->Get()->BridgeInfo->Piles);
+                TVector<TNodeId> activePileNodeIds;
+                for (size_t pileNum = 0; pileNum < piles.size(); ++pileNum) {
+                    if (piles[pileNum].IsPrimary || piles[pileNum].IsBeingPromoted) {
+                        if (PileNumToNodeIds.size() <= pileNum) {
+                            continue;
+                        }
+                        const auto& pileNodes(PileNumToNodeIds[pileNum]);
+                        for (TNodeId nodeId : *computeNodeIds) {
+                            if (pileNodes.count(nodeId) > 0) {
+                                activePileNodeIds.push_back(nodeId);
+                            }
+                        }
                     }
                 }
-            }
-            auto getContext = [&](const TString& pileName) -> TSelfCheckContext* {
-                auto it = pileContext.find(pileName);
-                if (it == pileContext.end()) {
-                    return &context;
-                } else {
-                    return &it->second;
-                }
-            };
-            long maxTimeDifferenceUs = 0;
-            for (TNodeId nodeId : *computeNodeIds) {
-                auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
-                if (itNodeSystemState != MergedNodeSystemState.end()) {
-                    const TString& pileName = itNodeSystemState->second->GetLocation().GetBridgePileName();
-                    if (std::count(activePiles.begin(), activePiles.end(), pileName) > 0
-                        && abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()) > maxTimeDifferenceUs) {
-                        maxTimeDifferenceUs = abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs());
-                        databaseState.MaxTimeDifferenceNodeId = nodeId;
-                    }
+                CalculateClockSkewState(databaseState, activePileNodeIds);
+                for (size_t pileNum = 0; pileNum < piles.size(); ++pileNum) {
+                    FillComputePileStatus(databaseState, pileNum, piles[pileNum].Name, *computeNodeIds, computeStatus, {&context, "BRIDGE_PILE"});
                 }
-            }
-            for (TNodeId nodeId : *computeNodeIds) {
-                auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
-                TString pileName;
-                if (itNodeSystemState != MergedNodeSystemState.end()) {
-                    pileName = itNodeSystemState->second->GetLocation().GetBridgePileName();
+                context.ReportWithMaxChildStatus("There are compute issues", ETags::ComputeState, {ETags::PileComputeState});
+            } else {
+                CalculateClockSkewState(databaseState, *computeNodeIds);
+                for (TNodeId nodeId : *computeNodeIds) {
+                    auto& computeNode = *computeStatus.add_nodes();
+                    FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"});
                 }
-                auto& computeNode = *computeStatus.add_nodes();
-                FillComputeNodeStatus(databaseState, nodeId, computeNode, {getContext(pileName), "COMPUTE_NODE"});
-            }
-            for (auto& [_, ctx] : pileContext) {
-                report(ctx, ETags::PileComputeState);
+                context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::Uptime});
+                context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState});
+                context.ReportWithMaxChildStatus("Compute quota usage", ETags::ComputeState, {ETags::QuotaUsage});
+                context.ReportWithMaxChildStatus("Clock skew issues", ETags::ComputeState, {ETags::NodeClockSkew});
             }
         }
         Ydb::Monitoring::StatusFlag::Status systemStatus = FillSystemTablets(databaseState, {&context, "SYSTEM_TABLET"});
         if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) {
             context.ReportStatus(systemStatus, "Compute has issues with system tablets", ETags::ComputeState, {ETags::SystemTabletState});
         }
-        FillComputeDatabaseStatus(databaseState, computeStatus, {&context, "COMPUTE_QUOTA"});
-        if (bridgeMode) {
-            context.ReportWithMaxChildStatus("There are compute issues", ETags::ComputeState, {ETags::PileComputeState});
-        } else {
-            report(context, ETags::ComputeState);
-        }
-        context.ReportWithMaxChildStatus("Database has time difference between nodes", ETags::ComputeState, {ETags::SyncState});
+        FillComputeDatabaseQuota(databaseState, computeStatus, {&context, "COMPUTE_QUOTA"});
+        FillComputeDatabaseClockSkew(databaseState, computeStatus, {&context, "CLOCK_SKEW"});
         Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
         computeNodeIds->push_back(0); // for tablets without node
         for (TNodeId nodeId : *computeNodeIds) {
