Skip to content

Commit a81054f

Browse files
Fix gpcheckresgroupv2impl to use dynamic cgroup parent from GUC parameter
This commit fixes issues introduced in "Add guc: gp_resource_group_cgroup_parent (#16738)" where the gp_resource_group_cgroup_parent GUC parameter was added but the gpcheckresgroupv2impl script still used hardcoded "gpdb" paths. Changes: - Implement get_cgroup_parent() method to dynamically retrieve the gp_resource_group_cgroup_parent value from database - Replace all hardcoded "gpdb" paths with dynamic cgroup parent value - Improve error handling in cgroup.c with more descriptive error messages - Fix test configuration order: set gp_resource_group_cgroup_parent before enabling gp_resource_manager=group-v2 to avoid validation failures This ensures the cgroup validation script works correctly with custom cgroup parent directories configured via the GUC parameter, making the resource group feature more flexible for different deployment scenarios.
1 parent 6d52fe3 commit a81054f

File tree

5 files changed

+95
-16
lines changed

5 files changed

+95
-16
lines changed

gpMgmt/bin/gpcheckresgroupv2impl

Lines changed: 46 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,17 @@
22
# Copyright (c) 2017, VMware, Inc. or its affiliates.
33

44
import os
5+
import sys
56
from functools import reduce
67

8+
# Add the gppylib path to sys.path to import database connection modules
9+
try:
10+
from gppylib.db import dbconn
11+
from pg import DatabaseError
12+
except ImportError as err:
13+
sys.exit('Cannot import modules. Please check that you have sourced '
14+
'cloudberry-env.sh. Detail: ' + str(err))
15+
716

817
class ValidationException(Exception):
918
def __init__(self, message):
@@ -29,6 +38,7 @@ class CgroupValidationVersionTwo(CgroupValidation):
2938
def __init__(self):
3039
self.mount_point = self.detect_cgroup_mount_point()
3140
self.tab = {"r": os.R_OK, "w": os.W_OK, "x": os.X_OK, "f": os.F_OK}
41+
self.cgroup_parent = self.get_cgroup_parent()
3242

3343
def validate_all(self):
3444
"""
@@ -43,23 +53,46 @@ class CgroupValidationVersionTwo(CgroupValidation):
4353

4454
self.validate_permission("cgroup.procs", "rw")
4555

46-
self.validate_permission("gpdb/", "rwx")
47-
self.validate_permission("gpdb/cgroup.procs", "rw")
56+
self.validate_permission(self.cgroup_parent + "/", "rwx")
57+
self.validate_permission(self.cgroup_parent + "/cgroup.procs", "rw")
58+
59+
self.validate_permission(self.cgroup_parent + "/cpu.max", "rw")
60+
self.validate_permission(self.cgroup_parent + "/cpu.weight", "rw")
61+
self.validate_permission(self.cgroup_parent + "/cpu.weight.nice", "rw")
62+
self.validate_permission(self.cgroup_parent + "/cpu.stat", "r")
4863

49-
self.validate_permission("gpdb/cpu.max", "rw")
50-
self.validate_permission("gpdb/cpu.weight", "rw")
51-
self.validate_permission("gpdb/cpu.weight.nice", "rw")
52-
self.validate_permission("gpdb/cpu.stat", "r")
64+
self.validate_permission(self.cgroup_parent + "/cpuset.cpus", "rw")
65+
self.validate_permission(self.cgroup_parent + "/cpuset.cpus.partition", "rw")
66+
self.validate_permission(self.cgroup_parent + "/cpuset.mems", "rw")
67+
self.validate_permission(self.cgroup_parent + "/cpuset.cpus.effective", "r")
68+
self.validate_permission(self.cgroup_parent + "/cpuset.mems.effective", "r")
5369

54-
self.validate_permission("gpdb/cpuset.cpus", "rw")
55-
self.validate_permission("gpdb/cpuset.cpus.partition", "rw")
56-
self.validate_permission("gpdb/cpuset.mems", "rw")
57-
self.validate_permission("gpdb/cpuset.cpus.effective", "r")
58-
self.validate_permission("gpdb/cpuset.mems.effective", "r")
70+
self.validate_permission(self.cgroup_parent + "/memory.current", "r")
5971

60-
self.validate_permission("gpdb/memory.current", "r")
72+
self.validate_permission(self.cgroup_parent + "/io.max", "rw")
6173

62-
self.validate_permission("gpdb/io.max", "rw")
74+
def get_cgroup_parent(self):
75+
"""
76+
Get the cgroup parent directory from the database GUC parameter
77+
gp_resource_group_cgroup_parent. If unable to connect to database
78+
or retrieve the parameter, report error using die function.
79+
"""
80+
try:
81+
dburl = dbconn.DbURL()
82+
83+
with dbconn.connect(dburl, utility=True) as conn:
84+
# Query the GUC parameter value
85+
sql = "SHOW gp_resource_group_cgroup_parent"
86+
cursor = dbconn.query(conn, sql)
87+
result = cursor.fetchone()
88+
89+
if result and result[0]:
90+
return result[0]
91+
else:
92+
self.die("failed to retrieve gp_resource_group_cgroup_parent parameter from database")
93+
94+
except Exception as e:
95+
self.die("failed to retrieve gp_resource_group_cgroup_parent parameter: {}".format(str(e)))
6396

6497
def die(self, msg):
6598
raise ValidationException("cgroup is not properly configured: {}".format(msg))

src/backend/utils/resgroup/cgroup.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -665,6 +665,12 @@ permListCheck(const PermList *permlist, Oid group, bool report)
665665
prop[0] ? "file" : "directory",
666666
path);
667667
}
668+
else
669+
{
670+
CGROUP_CONFIG_WARNING("invalid %s name '%s': %m",
671+
prop[0] ? "file" : "directory",
672+
path);
673+
}
668674
return false;
669675
}
670676

@@ -678,6 +684,12 @@ permListCheck(const PermList *permlist, Oid group, bool report)
678684
prop[0] ? "file" : "directory",
679685
path);
680686
}
687+
else
688+
{
689+
CGROUP_CONFIG_WARNING("can't access %s '%s': %m",
690+
prop[0] ? "file" : "directory",
691+
path);
692+
}
681693
return false;
682694
}
683695
}

src/include/utils/cgroup.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,18 @@
1818
#include "postgres.h"
1919
#include "nodes/pg_list.h"
2020

21+
extern char *gp_resource_group_cgroup_parent;
22+
2123
#define MAX_CGROUP_PATHLEN 256
2224
#define MAX_CGROUP_CONTENTLEN 1024
2325

26+
#define CGROUP_CONFIG_WARNING(msg, ...) \
27+
ereport(WARNING, \
28+
(errmsg("cgroup is not properly configured: " msg, ##__VA_ARGS__), \
29+
errhint("Please check whether the directory '/sys/fs/cgroup/%s' exists when gp_resource_manager = 'group-v2' and gp_resource_group_cgroup_parent = '%s'.", \
30+
gp_resource_group_cgroup_parent ? gp_resource_group_cgroup_parent : "", \
31+
gp_resource_group_cgroup_parent ? gp_resource_group_cgroup_parent : "")))
32+
2433
#define CGROUP_ERROR(...) elog(ERROR, __VA_ARGS__)
2534
#define CGROUP_CONFIG_ERROR(...) \
2635
CGROUP_ERROR("cgroup is not properly configured: " __VA_ARGS__)

src/test/isolation2/expected/resgroup/resgroup_auxiliary_tools_v2.out

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,20 @@ ERROR: language "plpython3u" already exists
2020
-- end_ignore
2121

2222
-- enable resource group and restart cluster.
23+
-- prerequisites:
24+
-- 1. '/sys/fs/cgroup/gpdb' must exist,
25+
-- otherwise create it before run installcheck-resgroup-v2;
26+
-- 2. 'gpconfig -c gp_resource_group_cgroup_parent -v "gpdb" && gpstop -rai'
27+
-- must run before 'gpconfig -c gp_resource_manager -v group-v2', because
28+
-- during the process of setting gp_resource_manager to group-v2, the
29+
-- system will check whether the directory
30+
-- '/sys/fs/cgroup/$gp_resource_group_cgroup_parent' exists.
2331
-- start_ignore
24-
! gpconfig -c gp_resource_manager -v group-v2;
32+
! gpconfig -c gp_resource_group_cgroup_parent -v "gpdb";
2533

26-
! gpconfig -c gp_resource_group_cgroup_parent -v "gpdb"
34+
! gpstop -rai;
35+
36+
! gpconfig -c gp_resource_manager -v group-v2;
2737

2838
! gpconfig -c max_connections -v 250 -m 25;
2939

@@ -39,6 +49,11 @@ ERROR: language "plpython3u" already exists
3949
---------------------
4050
group-v2
4151
(1 row)
52+
0: SHOW gp_resource_group_cgroup_parent;
53+
gp_resource_group_cgroup_parent
54+
---------------------------------
55+
gpdb
56+
(1 row)
4257

4358
-- resource queue statistics should not crash
4459
0: SELECT * FROM pg_resqueue_status;

src/test/isolation2/sql/resgroup/resgroup_auxiliary_tools_v2.sql

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,18 @@ CREATE LANGUAGE plpython3u;
1919
-- end_ignore
2020

2121
-- enable resource group and restart cluster.
22+
-- prerequisites:
23+
-- 1. '/sys/fs/cgroup/gpdb' must exist,
24+
-- otherwise create it before run installcheck-resgroup-v2;
25+
-- 2. 'gpconfig -c gp_resource_group_cgroup_parent -v "gpdb" && gpstop -rai'
26+
-- must run before 'gpconfig -c gp_resource_manager -v group-v2', because
27+
-- during the process of setting gp_resource_manager to group-v2, the
28+
-- system will check whether the directory
29+
-- '/sys/fs/cgroup/$gp_resource_group_cgroup_parent' exists.
2230
-- start_ignore
31+
! gpconfig -c gp_resource_group_cgroup_parent -v "gpdb";
32+
! gpstop -rai;
2333
! gpconfig -c gp_resource_manager -v group-v2;
24-
! gpconfig -c gp_resource_group_cgroup_parent -v "gpdb"
2534
! gpconfig -c max_connections -v 250 -m 25;
2635
! gpconfig -c runaway_detector_activation_percent -v 100;
2736
! gpstop -rai;
@@ -30,6 +39,7 @@ CREATE LANGUAGE plpython3u;
3039
-- after the restart we need a new connection to run the queries
3140

3241
0: SHOW gp_resource_manager;
42+
0: SHOW gp_resource_group_cgroup_parent;
3343

3444
-- resource queue statistics should not crash
3545
0: SELECT * FROM pg_resqueue_status;

0 commit comments

Comments
 (0)