50 changes: 46 additions & 4 deletions cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager.go
@@ -40,6 +40,8 @@ const (
nodepoolTags = "nodepoolTags"
min = "min"
max = "max"
minSize = "minSize"
maxSize = "maxSize"
)

var (
@@ -90,6 +92,9 @@ func CreateNodePoolManager(cloudConfigPath string, nodeGroupAutoDiscoveryList []
var err error
var configProvider common.ConfigurationProvider

// enable SDK to look up the IMDS endpoint to figure out the right realmDomain
common.EnableInstanceMetadataServiceLookup()

if os.Getenv(ipconsts.OciUseWorkloadIdentityEnvVar) == "true" {
klog.Info("using workload identity provider")
configProvider, err = auth.OkeWorkloadIdentityConfigurationProvider()
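
For context, a minimal sketch of how the IMDS lookup pairs with configuration-provider selection. This is not the autoscaler's exact code path: the environment variable literal stands in for ipconsts.OciUseWorkloadIdentityEnvVar, the instance-principal fallback is an assumption, and the imports use the public OCI Go SDK paths rather than the vendored copies used by this provider.

```go
package main

import (
	"fmt"
	"os"

	"github.com/oracle/oci-go-sdk/v65/common"
	"github.com/oracle/oci-go-sdk/v65/common/auth"
)

// newConfigProvider enables the instance metadata service (IMDS) lookup so the
// SDK can resolve the realm domain, then picks a configuration provider.
func newConfigProvider() (common.ConfigurationProvider, error) {
	common.EnableInstanceMetadataServiceLookup()

	// "OCI_USE_WORKLOAD_IDENTITY" is a stand-in for the provider's env var constant.
	if os.Getenv("OCI_USE_WORKLOAD_IDENTITY") == "true" {
		return auth.OkeWorkloadIdentityConfigurationProvider()
	}
	// Assumed fallback for this sketch; the real manager supports several auth modes.
	return auth.InstancePrincipalConfigurationProvider()
}

func main() {
	provider, err := newConfigProvider()
	if err != nil {
		fmt.Println("failed to build config provider:", err)
		return
	}
	region, _ := provider.Region()
	fmt.Println("resolved region:", region)
}
```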
@@ -214,21 +219,34 @@ func autoDiscoverNodeGroups(m *ociManagerImpl, okeClient okeClient, nodeGroup no
if validateNodepoolTags(nodeGroup.tags, nodePoolSummary.FreeformTags, nodePoolSummary.DefinedTags) {
nodepool := &nodePool{}
nodepool.id = *nodePoolSummary.Id
nodepool.minSize = nodeGroup.minSize
nodepool.maxSize = nodeGroup.maxSize
// set minSize-maxSize from nodepool free form tags, or else use nodeGroupAutoDiscovery configuration
nodepool.minSize = getIntFromMap(nodePoolSummary.FreeformTags, minSize, nodeGroup.minSize)
nodepool.maxSize = getIntFromMap(nodePoolSummary.FreeformTags, maxSize, nodeGroup.maxSize)

nodepool.manager = nodeGroup.manager
nodepool.kubeClient = nodeGroup.kubeClient

m.staticNodePools[nodepool.id] = nodepool
klog.V(5).Infof("auto discovered nodepool in compartment : %s , nodepoolid: %s", nodeGroup.compartmentId, nodepool.id)
klog.V(4).Infof("auto discovered nodepool in compartment : %s , nodepoolid: %s ,minSize: %d, maxSize:%d", nodeGroup.compartmentId, nodepool.id, nodepool.minSize, nodepool.maxSize)
Contributor:

I think this is an important log, should we set this to default verbosity rather than 4?

Contributor Author:

I agree the log is important; the only thing holding me back is that the refresh cache mechanism is triggered too often, and this log will become very noisy if we enable it by default.
@trungng92 Could you provide your feedback on this ?

Contributor:

I think we should log this with default verbosity inside the CreateNodePoolManager function after line 175; this way it isn't called after every refresh.
Something similar is done for the 'nodes' argument in line 284.

Contributor:

And the log line within this function can remain with a lower verbosity.

Contributor Author (@gvnc, Sep 8, 2025):

> Something similar is done for the 'nodes' argument in line 284.

We already have an equivalent log for nodeGroupAutoDiscovery.
https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager.go#L343

Contributor Author:

If we move the log line to the CreateNodePoolManager function as you suggested, it will only be printed once when the CA pod starts, but it wouldn't show the latest list of nodepools. When a customer updates the nodepool tags or adds a new nodepool that satisfies the discovery rules, we wouldn't see the updates.

Contributor (@trungng92, Sep 9, 2025):

I am okay with having it log in the refresh loop as long as we are only logging newly discovered node pools (e.g. if we discover node pool 1 on iteration 1, and node pool 2 on iteration 2, only log node pool 2).

It would be nice to avoid logging all auto discovered node pools every time (although technically it may be useful in the edge case where the user removes an auto discovered tag from a node pool).

Contributor Author:

I modified the code and added comparison logic to log the updates. It makes a copy of the already-discovered nodepools and runs a fresh auto-discovery, then compares the two lists and logs any new, updated, or deleted nodepools; otherwise, it logs nothing.
I have also added a test method to validate the changes. See the logs from the test output below.

=== RUN   TestNodeGroupAutoDiscovery
I0911 18:23:19.562355   71028 oci_manager.go:361] node group auto discovery spec constructed: &{manager:<nil> kubeClient:<nil> clusterId:ocid1.cluster.oc1.test-region.test compartmentId:ocid1.compartment.oc1.test-region.test tags:map[ca-managed:true namespace.foo:bar] minSize:1 maxSize:5}
I0911 18:23:19.564702   71028 oci_manager.go:434] Nodepool min/max sizes are updated. [id: node-pool-2 ,minSize: 4, maxSize:10]
I0911 18:23:19.564710   71028 oci_manager.go:432] New nodepool discovered. [id: node-pool-1 ,minSize: 1, maxSize:5]
I0911 18:23:19.564714   71028 oci_manager.go:441] Previously auto-discovered nodepool removed from the managed nodepool list. nodepoolid: node-pool-3
I0911 18:23:19.564717   71028 cache.go:44] rebuilding cache
I0911 18:23:19.564721   71028 oci_manager.go:452] Refreshed NodePool list, next refresh after 2025-09-11 18:28:19.56472 +0100 BST m=+300.032385626
I0911 18:23:19.564759   71028 cache.go:44] rebuilding cache
I0911 18:23:19.564761   71028 oci_manager.go:452] Refreshed NodePool list, next refresh after 2025-09-11 18:28:19.564761 +0100 BST m=+300.032426626
--- PASS: TestNodeGroupAutoDiscovery (0.00s)
PASS

} else {
klog.Warningf("nodepool ignored as the tags do not satisfy the requirement : %s , %v, %v", *nodePoolSummary.Id, nodePoolSummary.FreeformTags, nodePoolSummary.DefinedTags)
}
}
return true, nil
}

func getIntFromMap(m map[string]string, key string, defaultValue int) int {
value, ok := m[key]
if !ok {
return defaultValue
}
i, err := strconv.Atoi(value)
if err != nil {
return defaultValue
}
return i
}
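
To illustrate the override behaviour described in the comment above, here is a small standalone sketch; the helper is the one from this diff, and the tag values mirror the test data added below (minSize "4" and maxSize "10" overriding spec defaults of 1 and 5).

```go
package main

import (
	"fmt"
	"strconv"
)

// getIntFromMap is the helper from this diff: it returns the integer stored
// under key, or defaultValue when the key is missing or not a valid integer.
func getIntFromMap(m map[string]string, key string, defaultValue int) int {
	value, ok := m[key]
	if !ok {
		return defaultValue
	}
	i, err := strconv.Atoi(value)
	if err != nil {
		return defaultValue
	}
	return i
}

func main() {
	tags := map[string]string{"ca-managed": "true", "minSize": "4", "maxSize": "10"}

	// Freeform tags win over the nodeGroupAutoDiscovery defaults (min:1, max:5)...
	fmt.Println(getIntFromMap(tags, "minSize", 1)) // 4
	fmt.Println(getIntFromMap(tags, "maxSize", 5)) // 10

	// ...and the defaults are used when the tag is missing or malformed.
	fmt.Println(getIntFromMap(map[string]string{"minSize": "oops"}, "minSize", 1)) // 1
}
```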

func validateNodepoolTags(nodeGroupTags map[string]string, freeFormTags map[string]string, definedTags map[string]map[string]interface{}) bool {
if nodeGroupTags != nil {
for tagKey, tagValue := range nodeGroupTags {
@@ -394,11 +412,35 @@ func (m *ociManagerImpl) TaintToPreventFurtherSchedulingOnRestart(nodes []*apiv1
func (m *ociManagerImpl) forceRefresh() error {
// auto discover node groups
if m.nodeGroups != nil {
// empty previous nodepool map to do an auto discovery
// create a copy of m.staticNodePools to use it in comparison
staticNodePoolsCopy := make(map[string]NodePool)
for k, v := range m.staticNodePools {
staticNodePoolsCopy[k] = v
}

// empty previous nodepool map to do a fresh auto discovery
m.staticNodePools = make(map[string]NodePool)

// run auto-discovery
for _, nodeGroup := range m.nodeGroups {
autoDiscoverNodeGroups(m, m.okeClient, nodeGroup)
}

// compare the new and previous nodepool list to log the updates
for nodepoolId, nodepool := range m.staticNodePools {
if _, ok := staticNodePoolsCopy[nodepoolId]; !ok {
klog.Infof("New nodepool discovered. [id: %s ,minSize: %d, maxSize:%d]", nodepool.Id(), nodepool.MinSize(), nodepool.MaxSize())
} else if staticNodePoolsCopy[nodepoolId].MinSize() != nodepool.MinSize() || staticNodePoolsCopy[nodepoolId].MaxSize() != nodepool.MaxSize() {
klog.Infof("Nodepool min/max sizes are updated. [id: %s ,minSize: %d, maxSize:%d]", nodepool.Id(), nodepool.MinSize(), nodepool.MaxSize())
}
}

// log if there are nodepools removed from the list
for k := range staticNodePoolsCopy {
if _, ok := m.staticNodePools[k]; !ok {
klog.Infof("Previously auto-discovered nodepool removed from the managed nodepool list. nodepoolid: %s", k)
}
}
}
// rebuild nodepool cache
err := m.nodePoolCache.rebuild(m.staticNodePools, maxGetNodepoolRetries)
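The new forceRefresh logic is a snapshot-and-diff over the node pool map: copy the old map, rediscover, then report additions, size changes, and removals. A stripped-down standalone sketch of the same pattern (a plain struct instead of the NodePool interface, fmt instead of klog):

```go
package main

import "fmt"

type pool struct{ min, max int }

// diffPools reports additions, size updates, and removals between two
// snapshots of discovered node pools, mirroring the comparison above.
func diffPools(previous, current map[string]pool) {
	for id, np := range current {
		prev, existed := previous[id]
		switch {
		case !existed:
			fmt.Printf("new nodepool discovered: %s (min %d, max %d)\n", id, np.min, np.max)
		case prev != np:
			fmt.Printf("nodepool min/max sizes updated: %s (min %d, max %d)\n", id, np.min, np.max)
		}
	}
	for id := range previous {
		if _, stillThere := current[id]; !stillThere {
			fmt.Printf("nodepool removed from managed list: %s\n", id)
		}
	}
}

func main() {
	before := map[string]pool{"node-pool-2": {1, 5}, "node-pool-3": {2, 5}}
	after := map[string]pool{"node-pool-1": {1, 5}, "node-pool-2": {4, 10}}
	diffPools(before, after) // node-pool-1 is new, node-pool-2 changed, node-pool-3 removed
}
```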
86 changes: 82 additions & 4 deletions cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager_test.go
@@ -6,10 +6,12 @@ package nodepools

import (
"context"
"fmt"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/nodepools/consts"
"net/http"
"reflect"
"testing"
"time"

apiv1 "k8s.io/api/core/v1"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
@@ -20,6 +22,10 @@ import (
oke "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/vendor-internal/github.com/oracle/oci-go-sdk/v65/containerengine"
)

const (
autoDiscoveryCompartment = "ocid1.compartment.oc1.test-region.test"
)

func TestNodePoolFromArgs(t *testing.T) {
value := `1:5:ocid`
nodePool, err := nodePoolFromArg(value)
@@ -321,8 +327,15 @@ func TestBuildGenericLabels(t *testing.T) {

type mockOKEClient struct{}

func (c mockOKEClient) GetNodePool(context.Context, oke.GetNodePoolRequest) (oke.GetNodePoolResponse, error) {
return oke.GetNodePoolResponse{}, nil
func (c mockOKEClient) GetNodePool(ctx context.Context, req oke.GetNodePoolRequest) (oke.GetNodePoolResponse, error) {
return oke.GetNodePoolResponse{
NodePool: oke.NodePool{
Id: req.NodePoolId,
NodeConfigDetails: &oke.NodePoolNodeConfigDetails{
Size: common.Int(1),
},
},
}, nil
}
func (c mockOKEClient) UpdateNodePool(context.Context, oke.UpdateNodePoolRequest) (oke.UpdateNodePoolResponse, error) {
return oke.UpdateNodePoolResponse{}, nil
@@ -336,7 +349,39 @@ func (c mockOKEClient) DeleteNode(context.Context, oke.DeleteNodeRequest) (oke.D
}, nil
}

func (c mockOKEClient) ListNodePools(context.Context, oke.ListNodePoolsRequest) (oke.ListNodePoolsResponse, error) {
func (c mockOKEClient) ListNodePools(ctx context.Context, req oke.ListNodePoolsRequest) (oke.ListNodePoolsResponse, error) {
// below test data added for auto-discovery tests
if req.CompartmentId != nil && *req.CompartmentId == autoDiscoveryCompartment {
freeformTags1 := map[string]string{
"ca-managed": "true",
}
freeformTags2 := map[string]string{
"ca-managed": "true",
"minSize": "4",
"maxSize": "10",
}
definedTags := map[string]map[string]interface{}{
"namespace": {
"foo": "bar",
},
}
resp := oke.ListNodePoolsResponse{
Items: []oke.NodePoolSummary{
{
Id: common.String("node-pool-1"),
FreeformTags: freeformTags1,
DefinedTags: definedTags,
},
{
Id: common.String("node-pool-2"),
FreeformTags: freeformTags2,
DefinedTags: definedTags,
},
},
}
return resp, nil
}

return oke.ListNodePoolsResponse{}, nil
}

@@ -393,8 +438,41 @@ func TestRemoveInstance(t *testing.T) {
}
}

func TestNodeGroupAutoDiscovery(t *testing.T) {
var nodeGroupArg = fmt.Sprintf("clusterId:ocid1.cluster.oc1.test-region.test,compartmentId:%s,nodepoolTags:ca-managed=true&namespace.foo=bar,min:1,max:5", autoDiscoveryCompartment)
nodeGroup, err := nodeGroupFromArg(nodeGroupArg)
if err != nil {
t.Errorf("Error: #{err}")
}
nodePoolCache := newNodePoolCache(nil)
nodePoolCache.okeClient = mockOKEClient{}

cloudConfig := &ocicommon.CloudConfig{}
cloudConfig.Global.RefreshInterval = 5 * time.Minute
cloudConfig.Global.CompartmentID = autoDiscoveryCompartment

manager := &ociManagerImpl{
nodePoolCache: nodePoolCache,
nodeGroups: []nodeGroupAutoDiscovery{*nodeGroup},
okeClient: mockOKEClient{},
cfg: cloudConfig,
staticNodePools: map[string]NodePool{},
}
// test data to use as initial nodepools
nodepool2 := &nodePool{
id: "node-pool-2", minSize: 1, maxSize: 5,
}
manager.staticNodePools[nodepool2.id] = nodepool2
nodepool3 := &nodePool{
id: "node-pool-3", minSize: 2, maxSize: 5,
}
manager.staticNodePools[nodepool3.id] = nodepool3

manager.forceRefresh()
}
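
The test above relies on the refresh log output (shown in the review thread) for verification. If explicit checks are wanted, a sketch of assertions that could be appended at the end of TestNodeGroupAutoDiscovery, based only on the mockOKEClient data defined earlier in this file:

```go
// Sketch only: possible assertions for the end of TestNodeGroupAutoDiscovery.
if len(manager.staticNodePools) != 2 {
	t.Errorf("expected 2 auto-discovered nodepools, got %d", len(manager.staticNodePools))
}
if np, ok := manager.staticNodePools["node-pool-2"]; !ok || np.MinSize() != 4 || np.MaxSize() != 10 {
	t.Errorf("expected node-pool-2 to take minSize/maxSize 4/10 from its freeform tags")
}
if _, ok := manager.staticNodePools["node-pool-3"]; ok {
	t.Errorf("expected node-pool-3 to be dropped after the refresh")
}
```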

func TestNodeGroupFromArg(t *testing.T) {
var nodeGroupArg = "clusterId:ocid1.cluster.oc1.test-region.test,compartmentId:ocid1.compartment.oc1.test-region.test,nodepoolTags:ca-managed=true&namespace.foo=bar,min:1,max:5"
var nodeGroupArg = fmt.Sprintf("clusterId:ocid1.cluster.oc1.test-region.test,compartmentId:%s,nodepoolTags:ca-managed=true&namespace.foo=bar,min:1,max:5", autoDiscoveryCompartment)
nodeGroupAutoDiscovery, err := nodeGroupFromArg(nodeGroupArg)
if err != nil {
t.Errorf("Error: #{err}")