
Commit 8801ba1

tengomucho and dacorvo authored

Optimum neuron 0.3.0 (#3308)

* chore(neuron): update to optimum-neuron 0.3.0. Dependencies were changed accordingly, because the Neuron SDK was updated to v2.24.
* test: sampling is not deterministic. Also modify the temperature in the decode test to avoid granite stopping early.
* test(neuron): adjust expectations after graph changes
* test(neuron): use greedy decoding for stop sequences

Co-authored-by: David Corvoysier <david@huggingface.co>

1 parent d618424 · commit 8801ba1
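
A note on the testing rationale behind the changes below: sampled output is not reproducible across runs or across Neuron graph changes, so exact-text assertions are now made only for greedy decoding. A minimal sketch of that policy, assuming a hypothetical generate callable (not an API from this repository):

# Minimal sketch of the assertion policy used in the updated tests.
# `generate` is a hypothetical stand-in for the server-side generation call.
def check_decode(generate, prompt, do_sample, expected_greedy_text):
    text = generate(prompt, do_sample=do_sample, max_new_tokens=20)
    if do_sample:
        # Sampling is not deterministic: assert only structural properties.
        assert isinstance(text, str) and len(text) > 0
    else:
        # Greedy decoding is deterministic for a fixed model and graph,
        # so an exact comparison is safe.
        assert text == expected_greedy_text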

File tree

4 files changed (+48, -61 lines)


Dockerfile.neuron

Lines changed: 11 additions & 11 deletions

@@ -5,7 +5,7 @@ RUN mkdir -p /tgi
 # Fetch the optimum-neuron sources directly to avoid relying on pypi deployments
 FROM alpine AS optimum-neuron
 RUN mkdir -p /optimum-neuron
-ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.2.2.tar.gz /optimum-neuron/sources.tar.gz
+ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.3.0.tar.gz /optimum-neuron/sources.tar.gz
 RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1

 # Build cargo components (adapted from TGI original Dockerfile)
@@ -108,10 +108,10 @@ RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEU
 # Install neuronx packages
 RUN apt-get update -y \
     && apt-get install -y --no-install-recommends \
-        aws-neuronx-dkms=2.20.28.0 \
-        aws-neuronx-collectives=2.24.59.0-838c7fc8b \
-        aws-neuronx-runtime-lib=2.24.53.0-f239092cc \
-        aws-neuronx-tools=2.22.61.0 \
+        aws-neuronx-dkms=2.22.2.0 \
+        aws-neuronx-collectives=2.26.43.0-47cc904ea \
+        aws-neuronx-runtime-lib=2.26.42.0-2ff3b5c7d \
+        aws-neuronx-tools=2.24.54.0 \
         libxml2 \
     && rm -rf /var/lib/apt/lists/* \
     && apt-get clean
@@ -120,15 +120,15 @@ ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}"

 # Install manually torch CPU version to avoid pulling CUDA
 RUN pip3 install \
-    torch==2.5.1 \
-    torchvision==0.20.1 \
+    torch==2.7.0 \
+    torchvision==0.22.0 \
     --index-url https://download.pytorch.org/whl/cpu

 RUN pip3 install \
-    neuronx-cc==2.17.194.0 \
-    torch-neuronx==2.5.1.2.6.0 \
-    neuronx-distributed==0.11.0 \
-    libneuronxla==2.2.1630.0 \
+    neuronx-cc==2.19.8089.0+8ab9f450 \
+    torch-neuronx==2.7.0.2.8.6734+ac864f72 \
+    neuronx-distributed==0.13.14393+b8569585 \
+    libneuronxla==2.2.4410.0+835a67fb \
     --extra-index-url=https://pip.repos.neuron.amazonaws.com

 # Install HuggingFace packages
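
As a sanity check on the new pins, here is a minimal sketch (an assumption, not part of this commit) that could be run inside the built image to confirm the Python packages resolved to the pinned versions:

# Sketch: verify that installed packages match the versions pinned in
# Dockerfile.neuron by this commit. Run inside the built image.
from importlib.metadata import version

pinned = {
    "torch": "2.7.0",
    "torchvision": "0.22.0",
    "neuronx-cc": "2.19.8089.0+8ab9f450",
    "torch-neuronx": "2.7.0.2.8.6734+ac864f72",
    "neuronx-distributed": "0.13.14393+b8569585",
    "libneuronxla": "2.2.4410.0+835a67fb",
}

for name, expected in pinned.items():
    installed = version(name)
    assert installed == expected, f"{name}: expected {expected}, got {installed}"
print("All torch/Neuron pins match.")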

backends/neuron/tests/server/test_decode.py

Lines changed: 14 additions & 17 deletions

@@ -11,7 +11,14 @@ def test_decode(neuron_model_config):
     for do_sample in [True, False]:
         mode = "sample" if do_sample else "greedy"
         print(f"{config_name}[{mode}]")
-        _test_decode(config_name, generator, do_sample)
+        generated_text = _test_decode(config_name, generator, do_sample)
+        if not do_sample:
+            expected_text = {
+                "llama": " The world was holding its breath as the world's top scientists and engineers gathered at the secret underground facility",
+                "qwen2": " I was sitting in my room, staring at the clock, when a knock at the door. I",
+                "granite": "\n\nThis opening line is from George Orwell's dystopian novel, \"1",
+            }[config_name]
+            assert generated_text == expected_text
         generator.clear()


@@ -21,7 +28,11 @@ def _test_decode(config_name, generator, do_sample):
     )
     max_new_tokens = 20
     request = create_request(
-        id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample
+        id=0,
+        inputs=input_text,
+        max_new_tokens=max_new_tokens,
+        do_sample=do_sample,
+        temperature=0.9,
     )
     max_length = generator.model.neuron_config.sequence_length
     batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length)
@@ -38,18 +49,4 @@ def _test_decode(config_name, generator, do_sample):
     output = generations[0].generated_text
     assert output.generated_tokens == max_new_tokens
     assert output.finish_reason == 0
-    if do_sample:
-        expected_text = {
-            "llama": " I sat alone in the café",
-            "qwen2": " The air was so still",
-            "granite": "1984, George Orwell",
-        }[config_name]
-        assert expected_text in output.text
-    else:
-        print(output.text)
-        expected_text = {
-            "llama": " The world was holding its breath as the world's top scientists and engineers gathered at the secret underground facility",
-            "qwen2": " I was sitting in my room, staring at the ceiling, when the door opened and in came a",
-            "granite": "\n\nThis opening line from George Orwell's dystopian novel \"198",
-        }[config_name]
-        assert output.text == expected_text
+    return output.text
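
Condensed, the decode test now reads as below (names taken from the diff above; sampled runs only exercise the generation path, while greedy runs pin exact text):

# Sketch of the resulting control flow in test_decode (see the diff above).
for do_sample in [True, False]:
    generated_text = _test_decode(config_name, generator, do_sample)
    if not do_sample:
        # expected_text is the per-config lookup shown in the diff; exact
        # comparisons are only meaningful in deterministic greedy mode.
        assert generated_text == expected_text
    generator.clear()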

backends/neuron/tests/server/test_prefill.py

Lines changed: 13 additions & 19 deletions

@@ -44,23 +44,17 @@ def _test_prefill(config_name, generator, batch_size, do_sample):
     # because of static batching
     assert next_batch.max_tokens == batch_size * max_length
     assert len(generations) == batch_size
-    if do_sample:
-        expectations = {
-            "llama": [358, " I"],
-            "qwen2": [576, " The"],
-            "granite": [308, " ("],
-        }[config_name]
-    else:
-        expectations = {
-            "llama": [578, " The"],
-            "qwen2": [358, " I"],
-            "granite": [203, "\n"],
-        }[config_name]
-    for g in generations:
-        tokens = g.tokens
-        assert tokens.ids[0] == expectations[0]
-        assert tokens.texts[0] == expectations[1]
-
+    expectations = {
+        "llama": [578, " The"],
+        "qwen2": [358, " I"],
+        "granite": [203, "\n"],
+    }[config_name]
+    # Greedy mode should always generate the same output
+    if not do_sample:
+        for g in generations:
+            tokens = g.tokens
+            assert tokens.ids[0] == expectations[0]
+            assert tokens.texts[0] == expectations[1]

 def test_prefill_truncate(neuron_model_config):
     config_name = neuron_model_config["name"]
@@ -88,8 +82,8 @@ def test_prefill_truncate(neuron_model_config):
     # be different because of the truncation
     expectations = {
         "llama": [" He", "iens", "\x08", " He"],
-        "qwen2": [" He", " The", " He", " He"],
-        "granite": ["\n", "\n", " I", " He"],
+        "qwen2": [" He", "<|endoftext|>", " ", " The"],
+        "granite": ["\n", "\n", "\n", "\n"],
     }[config_name]
     for i, g in enumerate(generations):
         tokens = g.tokens

integration-tests/neuron/test_generate.py

Lines changed: 10 additions & 14 deletions

@@ -22,22 +22,22 @@ async def test_model_single_request(tgi_service):
     greedy_expectations = {
         "llama": " and how does it work?\nDeep learning is a subset of machine learning that uses artificial",
         "qwen2": " - Deep Learning is a subset of Machine Learning that involves the use of artificial neural networks",
-        "granite": "\n\nDeep learning is a subset of machine learning techniques based on artificial neural networks",
-        "qwen3": " A Deep Learning is a subset of machine learning that uses neural networks with multiple layers to",
+        "granite": "\n\nDeep Learning is a subset of machine learning that is inspired by the structure and",
+        "qwen3": " And Why Should You Care?\n\nDeep learning is a subset of machine learning that uses neural",
         "phi3": "\n\nDeep learning is a subfield of machine learning that focuses on creating",
     }
     assert response.generated_text == greedy_expectations[service_name]

     # Greedy bounded with input
-    response = await tgi_service.client.text_generation(
+    greedy_response = await tgi_service.client.text_generation(
         "What is Deep Learning?",
         max_new_tokens=17,
         return_full_text=True,
         details=True,
         decoder_input_details=True,
     )
-    assert response.details.generated_tokens == 17
-    assert response.generated_text == prompt + greedy_expectations[service_name]
+    assert greedy_response.details.generated_tokens == 17
+    assert greedy_response.generated_text == prompt + greedy_expectations[service_name]

     # Sampling
     response = await tgi_service.client.text_generation(
@@ -52,16 +52,12 @@ async def test_model_single_request(tgi_service):
     # The response must be different
     assert not response.startswith(greedy_expectations[service_name])

-    # Sampling with stop sequence (using one of the words returned from the previous test)
-    stop_sequence = response.split(" ")[-5]
+    # Greedy with stop sequence (using one of the words returned from the previous test)
+    stop_sequence = greedy_response.generated_text.split(" ")[-5]
     response = await tgi_service.client.text_generation(
         "What is Deep Learning?",
-        do_sample=True,
-        top_k=50,
-        top_p=0.9,
-        repetition_penalty=1.2,
+        do_sample=False,
         max_new_tokens=128,
-        seed=42,
         stop_sequences=[stop_sequence],
     )
     assert response.endswith(stop_sequence)
@@ -81,8 +77,8 @@ async def test_model_multiple_requests(tgi_service, neuron_generate_load):
     expectations = {
         "llama": "Deep learning is a subset of machine learning that uses artificial",
         "qwen2": "Deep Learning is a subset of Machine Learning that involves",
-        "granite": "Deep learning is a subset of machine learning techniques",
-        "qwen3": "Deep Learning is a subset of machine learning that uses neural networks",
+        "granite": "Deep Learning is a subset of machine learning that is inspired by the structure and",
+        "qwen3": " And Why Should You Care?\n\nDeep learning is a subset of machine learning that uses neural",
         "phi3": "Deep learning is a subfield of machine learning that focuses on creating",
     }
     expected = expectations[tgi_service.client.service_name]
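
The stop-sequence test now relies on greedy determinism: the stop word is taken from an earlier greedy response, and the follow-up request is also greedy, so the word is guaranteed to reappear. A sketch of that idea, with a hypothetical client stand-in:

# Sketch (hypothetical helper): greedy decoding reproduces the same
# continuation, so a word from a previous greedy response must reappear
# and trigger the stop condition.
async def check_stop_sequence(client, prompt, greedy_text):
    stop_sequence = greedy_text.split(" ")[-5]
    response = await client.text_generation(
        prompt,
        do_sample=False,  # greedy: deterministic, unlike the old sampled variant
        max_new_tokens=128,
        stop_sequences=[stop_sequence],
    )
    assert response.endswith(stop_sequence)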

0 commit comments