
Commit 8801ba1

tengomucho and dacorvo authored

Optimum neuron 0.3.0 (#3308)

* chore(neuron): update to optimum-neuron 0.3.0. Dependencies were changed accordingly, because the Neuron SDK was updated to v2.24.
* test: sampling is not deterministic. Also modify the temperature in the decode test to avoid granite stopping early.
* test(neuron): adjust expectations after graph changes
* test(neuron): use greedy decoding for stop sequences

Co-authored-by: David Corvoysier <david@huggingface.co>

1 parent d618424 · commit 8801ba1
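
A note on the testing rationale behind the changes below: sampled output is not reproducible across runs or across Neuron graph changes, so exact-text assertions are now made only for greedy decoding. A minimal sketch of that policy, assuming a hypothetical generate callable (not an API from this repository):

# Minimal sketch of the assertion policy used in the updated tests.
# `generate` is a hypothetical stand-in for the server-side generation call.
def check_decode(generate, prompt, do_sample, expected_greedy_text):
    text = generate(prompt, do_sample=do_sample, max_new_tokens=20)
    if do_sample:
        # Sampling is not deterministic: assert only structural properties.
        assert isinstance(text, str) and len(text) > 0
    else:
        # Greedy decoding is deterministic for a fixed model and graph,
        # so an exact comparison is safe.
        assert text == expected_greedy_text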

File tree

4 files changed (+48, -61 lines)


Dockerfile.neuron

Lines changed: 11 additions & 11 deletions

@@ -5,7 +5,7 @@ RUN mkdir -p /tgi
 # Fetch the optimum-neuron sources directly to avoid relying on pypi deployments
 FROM alpine AS optimum-neuron
 RUN mkdir -p /optimum-neuron
-ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.2.2.tar.gz /optimum-neuron/sources.tar.gz
+ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.3.0.tar.gz /optimum-neuron/sources.tar.gz
 RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1

 # Build cargo components (adapted from TGI original Dockerfile)
@@ -108,10 +108,10 @@ RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEU
 # Install neuronx packages
 RUN apt-get update -y \
     && apt-get install -y --no-install-recommends \
-        aws-neuronx-dkms=2.20.28.0 \
-        aws-neuronx-collectives=2.24.59.0-838c7fc8b \
-        aws-neuronx-runtime-lib=2.24.53.0-f239092cc \
-        aws-neuronx-tools=2.22.61.0 \
+        aws-neuronx-dkms=2.22.2.0 \
+        aws-neuronx-collectives=2.26.43.0-47cc904ea \
+        aws-neuronx-runtime-lib=2.26.42.0-2ff3b5c7d \
+        aws-neuronx-tools=2.24.54.0 \
         libxml2 \
     && rm -rf /var/lib/apt/lists/* \
     && apt-get clean
@@ -120,15 +120,15 @@ ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}"

 # Install manually torch CPU version to avoid pulling CUDA
 RUN pip3 install \
-    torch==2.5.1 \
-    torchvision==0.20.1 \
+    torch==2.7.0 \
+    torchvision==0.22.0 \
     --index-url https://download.pytorch.org/whl/cpu

 RUN pip3 install \
-    neuronx-cc==2.17.194.0 \
-    torch-neuronx==2.5.1.2.6.0 \
-    neuronx-distributed==0.11.0 \
-    libneuronxla==2.2.1630.0 \
+    neuronx-cc==2.19.8089.0+8ab9f450 \
+    torch-neuronx==2.7.0.2.8.6734+ac864f72 \
+    neuronx-distributed==0.13.14393+b8569585 \
+    libneuronxla==2.2.4410.0+835a67fb \
     --extra-index-url=https://pip.repos.neuron.amazonaws.com

 # Install HuggingFace packages
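
As a sanity check on the new pins, here is a minimal sketch (an assumption, not part of this commit) that could be run inside the built image to confirm the Python packages resolved to the pinned versions:

# Sketch: verify that installed packages match the versions pinned in
# Dockerfile.neuron by this commit. Run inside the built image.
from importlib.metadata import version

pinned = {
    "torch": "2.7.0",
    "torchvision": "0.22.0",
    "neuronx-cc": "2.19.8089.0+8ab9f450",
    "torch-neuronx": "2.7.0.2.8.6734+ac864f72",
    "neuronx-distributed": "0.13.14393+b8569585",
    "libneuronxla": "2.2.4410.0+835a67fb",
}

for name, expected in pinned.items():
    installed = version(name)
    assert installed == expected, f"{name}: expected {expected}, got {installed}"
print("All torch/Neuron pins match.")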

backends/neuron/tests/server/test_decode.py

Lines changed: 14 additions & 17 deletions

@@ -11,7 +11,14 @@ def test_decode(neuron_model_config):
     for do_sample in [True, False]:
         mode = "sample" if do_sample else "greedy"
         print(f"{config_name}[{mode}]")
-        _test_decode(config_name, generator, do_sample)
+        generated_text = _test_decode(config_name, generator, do_sample)
+        if not do_sample:
+            expected_text = {
+                "llama": " The world was holding its breath as the world's top scientists and engineers gathered at the secret underground facility",
+                "qwen2": " I was sitting in my room, staring at the clock, when a knock at the door. I",
+                "granite": "\n\nThis opening line is from George Orwell's dystopian novel, \"1",
+            }[config_name]
+            assert generated_text == expected_text
         generator.clear()


@@ -21,7 +28,11 @@ def _test_decode(config_name, generator, do_sample):
     )
     max_new_tokens = 20
     request = create_request(
-        id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample
+        id=0,
+        inputs=input_text,
+        max_new_tokens=max_new_tokens,
+        do_sample=do_sample,
+        temperature=0.9,
     )
     max_length = generator.model.neuron_config.sequence_length
     batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length)
@@ -38,18 +49,4 @@ def _test_decode(config_name, generator, do_sample):
     output = generations[0].generated_text
     assert output.generated_tokens == max_new_tokens
     assert output.finish_reason == 0
-    if do_sample:
-        expected_text = {
-            "llama": " I sat alone in the café",
-            "qwen2": " The air was so still",
-            "granite": "1984, George Orwell",
-        }[config_name]
-        assert expected_text in output.text
-    else:
-        print(output.text)
-        expected_text = {
-            "llama": " The world was holding its breath as the world's top scientists and engineers gathered at the secret underground facility",
-            "qwen2": " I was sitting in my room, staring at the ceiling, when the door opened and in came a",
-            "granite": "\n\nThis opening line from George Orwell's dystopian novel \"198",
-        }[config_name]
-        assert output.text == expected_text
+    return output.text
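
Condensed, the decode test now reads as below (names taken from the diff above; sampled runs only exercise the generation path, while greedy runs pin exact text):

# Sketch of the resulting control flow in test_decode (see the diff above).
for do_sample in [True, False]:
    generated_text = _test_decode(config_name, generator, do_sample)
    if not do_sample:
        # expected_text is the per-config lookup shown in the diff; exact
        # comparisons are only meaningful in deterministic greedy mode.
        assert generated_text == expected_text
    generator.clear()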

backends/neuron/tests/server/test_prefill.py

Lines changed: 13 additions & 19 deletions

@@ -44,23 +44,17 @@ def _test_prefill(config_name, generator, batch_size, do_sample):
     # because of static batching
     assert next_batch.max_tokens == batch_size * max_length
     assert len(generations) == batch_size
-    if do_sample:
-        expectations = {
-            "llama": [358, " I"],
-            "qwen2": [576, " The"],
-            "granite": [308, " ("],
-        }[config_name]
-    else:
-        expectations = {
-            "llama": [578, " The"],
-            "qwen2": [358, " I"],
-            "granite": [203, "\n"],
-        }[config_name]
-    for g in generations:
-        tokens = g.tokens
-        assert tokens.ids[0] == expectations[0]
-        assert tokens.texts[0] == expectations[1]
-
+    expectations = {
+        "llama": [578, " The"],
+        "qwen2": [358, " I"],
+        "granite": [203, "\n"],
+    }[config_name]
+    # Greedy mode should always generate the same output
+    if not do_sample:
+        for g in generations:
+            tokens = g.tokens
+            assert tokens.ids[0] == expectations[0]
+            assert tokens.texts[0] == expectations[1]

 def test_prefill_truncate(neuron_model_config):
     config_name = neuron_model_config["name"]
@@ -88,8 +82,8 @@ def test_prefill_truncate(neuron_model_config):
     # be different because of the truncation
     expectations = {
         "llama": [" He", "iens", "\x08", " He"],
-        "qwen2": [" He", " The", " He", " He"],
-        "granite": ["\n", "\n", " I", " He"],
+        "qwen2": [" He", "<|endoftext|>", " ", " The"],
+        "granite": ["\n", "\n", "\n", "\n"],
     }[config_name]
     for i, g in enumerate(generations):
         tokens = g.tokens

integration-tests/neuron/test_generate.py

Lines changed: 10 additions & 14 deletions

@@ -22,22 +22,22 @@ async def test_model_single_request(tgi_service):
     greedy_expectations = {
         "llama": " and how does it work?\nDeep learning is a subset of machine learning that uses artificial",
         "qwen2": " - Deep Learning is a subset of Machine Learning that involves the use of artificial neural networks",
-        "granite": "\n\nDeep learning is a subset of machine learning techniques based on artificial neural networks",
-        "qwen3": " A Deep Learning is a subset of machine learning that uses neural networks with multiple layers to",
+        "granite": "\n\nDeep Learning is a subset of machine learning that is inspired by the structure and",
+        "qwen3": " And Why Should You Care?\n\nDeep learning is a subset of machine learning that uses neural",
         "phi3": "\n\nDeep learning is a subfield of machine learning that focuses on creating",
     }
     assert response.generated_text == greedy_expectations[service_name]

     # Greedy bounded with input
-    response = await tgi_service.client.text_generation(
+    greedy_response = await tgi_service.client.text_generation(
         "What is Deep Learning?",
         max_new_tokens=17,
         return_full_text=True,
         details=True,
         decoder_input_details=True,
     )
-    assert response.details.generated_tokens == 17
-    assert response.generated_text == prompt + greedy_expectations[service_name]
+    assert greedy_response.details.generated_tokens == 17
+    assert greedy_response.generated_text == prompt + greedy_expectations[service_name]

     # Sampling
     response = await tgi_service.client.text_generation(
@@ -52,16 +52,12 @@ async def test_model_single_request(tgi_service):
     # The response must be different
     assert not response.startswith(greedy_expectations[service_name])

-    # Sampling with stop sequence (using one of the words returned from the previous test)
-    stop_sequence = response.split(" ")[-5]
+    # Greedy with stop sequence (using one of the words returned from the previous test)
+    stop_sequence = greedy_response.generated_text.split(" ")[-5]
     response = await tgi_service.client.text_generation(
         "What is Deep Learning?",
-        do_sample=True,
-        top_k=50,
-        top_p=0.9,
-        repetition_penalty=1.2,
+        do_sample=False,
         max_new_tokens=128,
-        seed=42,
         stop_sequences=[stop_sequence],
     )
     assert response.endswith(stop_sequence)
@@ -81,8 +77,8 @@ async def test_model_multiple_requests(tgi_service, neuron_generate_load):
     expectations = {
         "llama": "Deep learning is a subset of machine learning that uses artificial",
         "qwen2": "Deep Learning is a subset of Machine Learning that involves",
-        "granite": "Deep learning is a subset of machine learning techniques",
-        "qwen3": "Deep Learning is a subset of machine learning that uses neural networks",
+        "granite": "Deep Learning is a subset of machine learning that is inspired by the structure and",
+        "qwen3": " And Why Should You Care?\n\nDeep learning is a subset of machine learning that uses neural",
         "phi3": "Deep learning is a subfield of machine learning that focuses on creating",
     }
     expected = expectations[tgi_service.client.service_name]
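
The stop-sequence test now relies on greedy determinism: the stop word is taken from an earlier greedy response, and the follow-up request is also greedy, so the word is guaranteed to reappear. A sketch of that idea, with a hypothetical client stand-in:

# Sketch (hypothetical helper): greedy decoding reproduces the same
# continuation, so a word from a previous greedy response must reappear
# and trigger the stop condition.
async def check_stop_sequence(client, prompt, greedy_text):
    stop_sequence = greedy_text.split(" ")[-5]
    response = await client.text_generation(
        prompt,
        do_sample=False,  # greedy: deterministic, unlike the old sampled variant
        max_new_tokens=128,
        stop_sequences=[stop_sequence],
    )
    assert response.endswith(stop_sequence)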

0 commit comments