Skip to content

Commit 9f38d93

Browse files
Gaudi: add CI (#3160)
Co-authored-by: Pauline Bailly-Masson <155966238+paulinebm@users.noreply.github.com>
1 parent 7199074 commit 9f38d93

File tree

9 files changed

+160
-75
lines changed

9 files changed

+160
-75
lines changed

.github/workflows/build.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,9 +129,9 @@ jobs:
129129
export label_extension="-gaudi"
130130
export docker_volume="/mnt/cache"
131131
export docker_devices=""
132-
export runs_on="ubuntu-latest"
132+
export runs_on="itac-bm-emr-gaudi3-dell-2gaudi"
133133
export platform=""
134-
export extra_pytest=""
134+
export extra_pytest="--gaudi"
135135
export target=""
136136
esac
137137
echo $dockerfile

backends/gaudi/Makefile

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,11 +50,14 @@ local-dev-install: install-dependencies
5050

5151
# In order to run the integration tests, you need to first build the image (make -C backends/gaudi image)
5252
run-integration-tests:
53-
pip install -U pip uv
54-
uv pip install -r ${root_dir}/backends/gaudi/server/integration-tests/requirements.txt
5553
DOCKER_VOLUME=${root_dir}/data \
5654
HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
57-
uv run pytest --durations=0 -sv ${root_dir}/backends/gaudi/server/integration-tests
55+
pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi
56+
57+
run-integration-tests-with-all-models:
58+
DOCKER_VOLUME=${root_dir}/data \
59+
HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
60+
pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi --gaudi-all-models
5861

5962
# This is used to capture the expected outputs for the integration tests, offering an easy way to add more models to them
6063
capture-expected-outputs-for-integration-tests:

backends/gaudi/README.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,16 +99,26 @@ curl 127.0.0.1:8080/generate \
9999

100100
### Integration tests
101101

102+
Install the dependencies:
103+
```bash
104+
pip install -r integration-tests/requirements.txt
105+
```
106+
102107
To run the integration tests, you need to first build the image:
103108
```bash
104109
make -C backends/gaudi image
105110
```
106111

107-
Then run the following command to run the integration tests:
112+
Then run the following command to run the integration tests (the default subset of models, as run in CI):
108113
```bash
109114
make -C backends/gaudi run-integration-tests
110115
```
111116

117+
To run the integration tests with all models, you can run the following command:
118+
```bash
119+
make -C backends/gaudi run-integration-tests-with-all-models
120+
```
121+
112122
To capture the expected outputs for the integration tests, you can run the following command:
113123
```bash
114124
make -C backends/gaudi capture-expected-outputs-for-integration-tests

backends/gaudi/server/integration-tests/pytest.ini

Lines changed: 0 additions & 2 deletions
This file was deleted.

backends/gaudi/server/integration-tests/requirements.txt

Lines changed: 0 additions & 7 deletions
This file was deleted.

integration-tests/conftest.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.export_models"]
1+
pytest_plugins = [
2+
"fixtures.neuron.service",
3+
"fixtures.neuron.export_models",
4+
"fixtures.gaudi.service",
5+
]
26
# ruff: noqa: E402
37
from _pytest.fixtures import SubRequest
48
from huggingface_hub.inference._generated.types.chat_completion import (
@@ -68,6 +72,15 @@ def pytest_addoption(parser):
6872
parser.addoption(
6973
"--neuron", action="store_true", default=False, help="run neuron tests"
7074
)
75+
parser.addoption(
76+
"--gaudi", action="store_true", default=False, help="run gaudi tests"
77+
)
78+
parser.addoption(
79+
"--gaudi-all-models",
80+
action="store_true",
81+
default=False,
82+
help="Run tests for all models instead of just the default subset",
83+
)
7184

7285

7386
def pytest_configure(config):
@@ -84,6 +97,22 @@ def skip_release(item):
8497
item.add_marker(pytest.mark.skip(reason="need --release option to run"))
8598

8699
selectors.append(skip_release)
100+
101+
if config.getoption("--gaudi"):
102+
103+
def skip_not_gaudi(item):
104+
if "gaudi" not in item.keywords:
105+
item.add_marker(pytest.mark.skip(reason="requires --gaudi to run"))
106+
107+
selectors.append(skip_not_gaudi)
108+
else:
109+
110+
def skip_gaudi(item):
111+
if "gaudi" in item.keywords:
112+
item.add_marker(pytest.mark.skip(reason="requires --gaudi to run"))
113+
114+
selectors.append(skip_gaudi)
115+
87116
if config.getoption("--neuron"):
88117

89118
def skip_not_neuron(item):
@@ -100,6 +129,7 @@ def skip_neuron(item):
100129
item.add_marker(pytest.mark.skip(reason="requires --neuron to run"))
101130

102131
selectors.append(skip_neuron)
132+
103133
for item in items:
104134
for selector in selectors:
105135
selector(item)

backends/gaudi/server/integration-tests/conftest.py renamed to integration-tests/fixtures/gaudi/service.py

Lines changed: 49 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,21 @@
1414
import pytest
1515
from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
1616
from docker.errors import NotFound
17-
from loguru import logger
18-
from test_model import TEST_CONFIGS
19-
from text_generation import AsyncClient
20-
from text_generation.types import Response
17+
import logging
18+
from huggingface_hub import AsyncInferenceClient, TextGenerationOutput
19+
import huggingface_hub
20+
21+
logging.basicConfig(
22+
level=logging.INFO,
23+
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
24+
stream=sys.stdout,
25+
)
26+
logger = logging.getLogger(__file__)
2127

2228
# Use the latest image from the local docker build
2329
DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tgi-gaudi")
2430
DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", None)
25-
HF_TOKEN = os.getenv("HF_TOKEN", None)
31+
HF_TOKEN = huggingface_hub.get_token()
2632

2733
assert (
2834
HF_TOKEN is not None
@@ -48,12 +54,6 @@
4854
"cap_add": ["sys_nice"],
4955
}
5056

51-
logger.add(
52-
sys.stderr,
53-
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
54-
level="INFO",
55-
)
56-
5757

5858
def stream_container_logs(container, test_name):
5959
"""Stream container logs in a separate thread."""
@@ -69,9 +69,15 @@ def stream_container_logs(container, test_name):
6969
logger.error(f"Error streaming container logs: {str(e)}")
7070

7171

72+
class TestClient(AsyncInferenceClient):
73+
def __init__(self, service_name: str, base_url: str):
74+
super().__init__(model=base_url)
75+
self.service_name = service_name
76+
77+
7278
class LauncherHandle:
73-
def __init__(self, port: int):
74-
self.client = AsyncClient(f"http://localhost:{port}", timeout=3600)
79+
def __init__(self, service_name: str, port: int):
80+
self.client = TestClient(service_name, f"http://localhost:{port}")
7581

7682
def _inner_health(self):
7783
raise NotImplementedError
@@ -87,7 +93,7 @@ async def health(self, timeout: int = 60):
8793
raise RuntimeError("Launcher crashed")
8894

8995
try:
90-
await self.client.generate("test")
96+
await self.client.text_generation("test", max_new_tokens=1)
9197
elapsed = time.time() - start_time
9298
logger.info(f"Health check passed after {elapsed:.1f}s")
9399
return
@@ -111,7 +117,8 @@ async def health(self, timeout: int = 60):
111117

112118
class ContainerLauncherHandle(LauncherHandle):
113119
def __init__(self, docker_client, container_name, port: int):
114-
super(ContainerLauncherHandle, self).__init__(port)
120+
service_name = container_name # Use container name as service name
121+
super(ContainerLauncherHandle, self).__init__(service_name, port)
115122
self.docker_client = docker_client
116123
self.container_name = container_name
117124

@@ -132,7 +139,8 @@ def _inner_health(self) -> bool:
132139

133140
class ProcessLauncherHandle(LauncherHandle):
134141
def __init__(self, process, port: int):
135-
super(ProcessLauncherHandle, self).__init__(port)
142+
service_name = "process" # Use generic name for process launcher
143+
super(ProcessLauncherHandle, self).__init__(service_name, port)
136144
self.process = process
137145

138146
def _inner_health(self) -> bool:
@@ -151,11 +159,13 @@ def data_volume():
151159

152160

153161
@pytest.fixture(scope="module")
154-
def launcher(data_volume):
162+
def gaudi_launcher():
155163
@contextlib.contextmanager
156164
def docker_launcher(
157165
model_id: str,
158166
test_name: str,
167+
tgi_args: List[str] = None,
168+
env_config: dict = None,
159169
):
160170
logger.info(
161171
f"Starting docker launcher for model {model_id} and test {test_name}"
@@ -183,32 +193,40 @@ def get_free_port():
183193
)
184194
container.stop()
185195
container.wait()
196+
container.remove()
197+
logger.info(f"Removed existing container {container_name}")
186198
except NotFound:
187199
pass
188200
except Exception as e:
189201
logger.error(f"Error handling existing container: {str(e)}")
190202

191-
model_name = next(
192-
name for name, cfg in TEST_CONFIGS.items() if cfg["model_id"] == model_id
193-
)
194-
195-
tgi_args = TEST_CONFIGS[model_name]["args"].copy()
203+
if tgi_args is None:
204+
tgi_args = []
205+
else:
206+
tgi_args = tgi_args.copy()
196207

197208
env = BASE_ENV.copy()
198209

199210
# Add model_id to env
200211
env["MODEL_ID"] = model_id
201212

202-
# Add env config that is definied in the fixture parameter
203-
if "env_config" in TEST_CONFIGS[model_name]:
204-
env.update(TEST_CONFIGS[model_name]["env_config"].copy())
213+
# Add env config that is defined in the fixture parameter
214+
if env_config is not None:
215+
env.update(env_config.copy())
205216

206-
volumes = [f"{DOCKER_VOLUME}:/data"]
217+
volumes = []
218+
if DOCKER_VOLUME:
219+
volumes = [f"{DOCKER_VOLUME}:/data"]
207220
logger.debug(f"Using volume {volumes}")
208221

209222
try:
223+
logger.debug(f"Using command {tgi_args}")
210224
logger.info(f"Creating container with name {container_name}")
211225

226+
logger.debug(f"Using environment {env}")
227+
logger.debug(f"Using volumes {volumes}")
228+
logger.debug(f"HABANA_RUN_ARGS {HABANA_RUN_ARGS}")
229+
212230
# Log equivalent docker run command for debugging, this is not actually executed
213231
container = client.containers.run(
214232
DOCKER_IMAGE,
@@ -271,15 +289,16 @@ def get_free_port():
271289

272290

273291
@pytest.fixture(scope="module")
274-
def generate_load():
292+
def gaudi_generate_load():
275293
async def generate_load_inner(
276-
client: AsyncClient, prompt: str, max_new_tokens: int, n: int
277-
) -> List[Response]:
294+
client: AsyncInferenceClient, prompt: str, max_new_tokens: int, n: int
295+
) -> List[TextGenerationOutput]:
278296
try:
279297
futures = [
280-
client.generate(
298+
client.text_generation(
281299
prompt,
282300
max_new_tokens=max_new_tokens,
301+
details=True,
283302
decoder_input_details=True,
284303
)
285304
for _ in range(n)

backends/gaudi/server/integration-tests/capture_expected_outputs.py renamed to integration-tests/gaudi/capture_expected_outputs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from typing import Dict, Any, Generator
44

55
import pytest
6-
from test_model import TEST_CONFIGS
6+
from test_gaudi_generate import TEST_CONFIGS
77

88
UNKNOWN_CONFIGS = {
99
name: config

0 commit comments

Comments
 (0)