
Commit 2ba396c

Merge branch 'main' into add_logs_gaudi_warmup
2 parents: 3245b89 + 3752143

144 files changed: +7327 -8667 lines

Large commits have some content hidden by default; not every changed file is rendered below.

.github/workflows/nix_build.yaml

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ jobs:
           nix_path: nixpkgs=channel:nixos-unstable
       - uses: cachix/cachix-action@v14
         with:
-          name: text-generation-inference
+          name: huggingface
           # If you chose signing key for write access
           authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
         env:

.github/workflows/nix_cache.yaml

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ jobs:
           nix_path: nixpkgs=channel:nixos-unstable
       - uses: cachix/cachix-action@v14
         with:
-          name: text-generation-inference
+          name: huggingface
           # If you chose signing key for write access
           authToken: "${{ secrets.CACHIX_AUTH_TOKEN }}"
         env:

.github/workflows/nix_tests.yaml

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ jobs:
           nix_path: nixpkgs=channel:nixos-unstable
       - uses: cachix/cachix-action@v14
         with:
-          name: text-generation-inference
+          name: huggingface
           # If you chose signing key for write access
           authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
         env:

Cargo.lock

Lines changed: 8 additions & 8 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ default-members = [
 resolver = "2"

 [workspace.package]
-version = "3.3.0-dev0"
+version = "3.3.2-dev0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"

Dockerfile

Lines changed: 1 addition & 10 deletions
@@ -48,7 +48,7 @@ FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS pytorch-install
 WORKDIR /usr/src/

 # NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
-ARG PYTORCH_VERSION=2.6
+ARG PYTORCH_VERSION=2.7
 ARG PYTHON_VERSION=3.11

 # Keep in sync with `server/pyproject.toml

@@ -121,13 +121,6 @@ COPY server/Makefile-awq Makefile
 # Build specific version of transformers
 RUN . .venv/bin/activate && make build-awq

-# Build Lorax Punica kernels
-FROM kernel-builder AS lorax-punica-builder
-WORKDIR /usr/src
-COPY server/Makefile-lorax-punica Makefile
-# Build specific version of transformers
-RUN . .venv/bin/activate && TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica
-
 # Build Transformers CUDA kernels
 FROM kernel-builder AS custom-kernels-builder
 WORKDIR /usr/src

@@ -210,8 +203,6 @@ COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311
 COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 # Copy build artifacts from awq kernels builder
 COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
-# Copy build artifacts from lorax punica kernels builder
-COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 # Copy build artifacts from mamba builder
 COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /usr/src/.venv/lib/python3.11/site-packages
 COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /usr/src/.venv/lib/python3.11/site-packages

Dockerfile.neuron

Lines changed: 10 additions & 11 deletions
@@ -5,7 +5,7 @@ RUN mkdir -p /tgi
 # Fetch the optimum-neuron sources directly to avoid relying on pypi deployments
 FROM alpine AS optimum-neuron
 RUN mkdir -p /optimum-neuron
-ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.1.0.tar.gz /optimum-neuron/sources.tar.gz
+ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.2.0.tar.gz /optimum-neuron/sources.tar.gz
 RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1

 # Build cargo components (adapted from TGI original Dockerfile)

@@ -108,10 +108,10 @@ RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEU
 # Install neuronx packages
 RUN apt-get update -y \
   && apt-get install -y --no-install-recommends \
-  aws-neuronx-dkms=2.19.64.0 \
-  aws-neuronx-collectives=2.23.135.0-3e70920f2 \
-  aws-neuronx-runtime-lib=2.23.112.0-9b5179492 \
-  aws-neuronx-tools=2.20.204.0 \
+  aws-neuronx-dkms=2.20.28.0 \
+  aws-neuronx-collectives=2.24.59.0-838c7fc8b \
+  aws-neuronx-runtime-lib=2.24.53.0-f239092cc \
+  aws-neuronx-tools=2.22.61.0 \
   libxml2 \
   && rm -rf /var/lib/apt/lists/* \
   && apt-get clean

@@ -125,11 +125,10 @@ RUN pip3 install \
   --index-url https://download.pytorch.org/whl/cpu

 RUN pip3 install \
-  neuronx-cc==2.16.372.0 \
-  torch-neuronx==2.5.1.2.4.0 \
-  transformers-neuronx==0.13.322 \
-  neuronx-distributed==0.10.1 \
-  libneuronxla==2.1.681.0 \
+  neuronx-cc==2.17.194.0 \
+  torch-neuronx==2.5.1.2.6.0 \
+  neuronx-distributed==0.11.0 \
+  libneuronxla==2.2.1630.0 \
   --extra-index-url=https://pip.repos.neuron.amazonaws.com

 # Install HuggingFace packages

@@ -160,7 +159,7 @@ RUN pip install dist/text_generation_server*.tar.gz
 # Final image
 FROM neuron

-COPY backends/neuron/tgi_env.py /tgi_env.py
+COPY backends/neuron/tgi_entry_point.py /tgi_entry_point.py
 COPY backends/neuron/tgi-entrypoint.sh /tgi-entrypoint.sh
 RUN chmod +x /tgi-entrypoint.sh

Dockerfile.nix

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 FROM nixos/nix:2.18.8 AS builder
 RUN echo "experimental-features = nix-command flakes" >> /etc/nix/nix.conf
 RUN nix profile install nixpkgs#cachix
-RUN cachix use text-generation-inference
+RUN cachix use huggingface
 WORKDIR /root
 ADD . .
 RUN nix build .
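
The same cache rename applies outside of Docker: anyone building TGI with Nix locally has to switch to the new Cachix cache name. A minimal sketch, reusing the exact commands from the Dockerfile.nix stage above; the only assumption is that Nix with flakes is already installed on the machine:

    # Install the Cachix CLI and enable the renamed binary cache
    nix profile install nixpkgs#cachix
    cachix use huggingface   # previously: cachix use text-generation-inference

    # Build TGI from the repository root, pulling prebuilt dependencies from the cache
    nix build .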

Dockerfile_gaudi

Lines changed: 8 additions & 3 deletions
@@ -1,5 +1,5 @@
 # Those arguments are required to build the image
-ARG HABANA_VERSION=1.20.0
+ARG HABANA_VERSION=1.21.0
 ARG PYTORCH_VERSION=2.6.0

 # Rust builder

@@ -57,9 +57,12 @@ ARG PYTORCH_VERSION

 FROM vault.habana.ai/gaudi-docker/${HABANA_VERSION}/ubuntu22.04/habanalabs/pytorch-installer-${PYTORCH_VERSION}:latest AS base

-ENV ATTENTION=default
+ENV ATTENTION=paged
 ENV PREFIX_CACHING=0
 ENV PREFILL_CHUNKING=0
+ENV PT_HPU_LAZY_MODE=1
+ENV PT_HPU_WEIGHT_SHARING=0
+ENV VLLM_EXPONENTIAL_BUCKETING=true

 # Text Generation Inference base env
 ENV HF_HOME=/data \

@@ -95,7 +98,9 @@ RUN cd server && \
   pip install "git+https://github.com/HabanaAI/DeepSpeed.git@${HABANA_VERSION}" && \
   BUILD_CUDA_EXT=0 pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@097dd04e --no-build-isolation && \
   pip install . --no-cache-dir
-RUN pip install git+https://github.com/sywangyi/vllm-hpu-extension.git
+RUN pip install git+https://github.com/sywangyi/vllm-hpu-extension.git@bmax_fix
+RUN pip install compressed-tensors==0.9.1
+
 # Install benchmarker
 COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
 # Install router
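
The new ENV lines above change the Gaudi image defaults (paged attention, HPU lazy mode, exponential bucketing), but since they are plain environment variables they can still be overridden when the container starts. A minimal sketch of such an override; the Gaudi image tag, the `--runtime=habana` flag, and `HABANA_VISIBLE_DEVICES` are assumptions based on the usual Habana container setup, not taken from this diff:

    model=HuggingFaceH4/zephyr-7b-beta
    volume=$PWD/data

    # Assumed invocation (habana runtime + Gaudi image tag); override one of the new defaults at run time
    docker run --runtime=habana -e HABANA_VISIBLE_DEVICES=all --ipc=host \
        -p 8080:80 -v $volume:/data \
        -e PT_HPU_LAZY_MODE=0 \
        ghcr.io/huggingface/text-generation-inference:3.3.2-gaudi --model-id $model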

README.md

Lines changed: 4 additions & 4 deletions
@@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta
 volume=$PWD/data

 docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:3.3.0 --model-id $model
+    ghcr.io/huggingface/text-generation-inference:3.3.2 --model-id $model
 ```

 And then you can make requests like

@@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \

 **Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.

-**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.0-rocm --model-id $model` instead of the command above.
+**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.2-rocm --model-id $model` instead of the command above.

 To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
 ```

@@ -152,7 +152,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
 token=<your cli READ token>

 docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:3.3.0 --model-id $model
+    ghcr.io/huggingface/text-generation-inference:3.3.2 --model-id $model
 ```

 ### A note on Shared Memory (shm)

@@ -256,7 +256,7 @@ Another option is to install `text-generation-inference` locally using [Nix](htt
 we only support Nix on x86_64 Linux with CUDA GPUs. When using Nix, all dependencies can
 be pulled from a binary cache, removing the need to build them locally.

-First follow the instructions to [install Cachix and enable the TGI cache](https://app.cachix.org/cache/text-generation-inference).
+First follow the instructions to [install Cachix and enable the Hugging Face cache](https://app.cachix.org/cache/huggingface).
 Setting up the cache is important, otherwise Nix will build many of the dependencies
 locally, which can take hours.

0 commit comments
