[skyrl-train][dependencies] separate vllm + megatron + bump vllm back to 0.11.0 + pin minimum uv version for extra-build-dependencies (NovaSky-AI#528)
## Separate vllm + megatron deps
After NovaSky-AI#481, there were megatron flashinfer issues with `--extra vllm`. This PR separates the vllm version that megatron relies on from the general vllm version, allowing us to bump vllm to 0.11.0 for the rest of the training stack.
## Update flash-attn installation
Updates the flash-attn installation to use uv's `extra-build-dependencies`
feature, which requires uv version >= 0.8.10. This feature lets us write the
following, removing the need to juggle markers + extras to specify a URL
source for each set of extras:
```toml
[tool.uv.extra-build-dependencies]
flash-attn = [{ requirement = "torch", match-runtime = true }]

[tool.uv.extra-build-variables]
flash-attn = { FLASH_ATTENTION_SKIP_CUDA_BUILD = "TRUE" }

[project.optional-dependencies]
vllm = [
    "vllm==0.11.0",
    "flash-attn==2.8.3",
    ...
]
mcore = [
    "flash-attn==2.7.4.post1",
    ...
]
```
`skyrl-train/docs/examples/megatron.rst` (2 additions, 7 deletions):
```diff
@@ -104,13 +104,8 @@ After following the installation instructions, set the following environment var
 Flash Attention
 ~~~~~~~~~~~~~~~
-Next, in order to use flash attention with the megatron backend, you must use ``flash_attn`` version ``2.7.4.post1`` or lower for compatibility with ``TransformerEngine==2.5.0``.
-You can replace the ``flash-attn`` wheel in the ``pyproject.toml`` file with the following to use the ``2.7.4.post1`` release, and you can find wheels for other versions `here <https://github.com/Dao-AILab/flash-attention/releases>`_.
+In order to use flash attention with the megatron backend, you must use ``flash_attn`` version ``2.7.4.post1`` or lower for compatibility with ``TransformerEngine==2.5.0``.
+This is handled in the ``pyproject.toml`` file for the ``mcore`` extra.
```
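The "``2.7.4.post1`` or lower" constraint is easy to get wrong because of the post-release suffix. A minimal sketch of checking it with the `packaging` library (not part of the PR; the version strings are illustrative):

```python
from packaging.version import Version

# Ceiling for the megatron backend, per the doc change above:
# TransformerEngine==2.5.0 needs flash_attn <= 2.7.4.post1.
max_supported = Version("2.7.4.post1")

# Post-releases sort *after* their base version, so 2.7.4.post1 > 2.7.4.
print(Version("2.7.4.post1") > Version("2.7.4"))  # True

print(Version("2.7.4.post1") <= max_supported)  # True: the mcore pin is fine
print(Version("2.8.3") <= max_supported)        # False: the vllm extra's newer pin is not
```

This is why the two extras need different `flash-attn` pins rather than one shared version.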
`pyproject.toml`:

```diff
 # NOTE (sumanthrh): We explictly use a flashinfer wheel from their index.
-# The wheels on PyPI don't come with pre-compiled kernels and the package will JIT compile them at runtime which is slow.
-# additionally, different inference engines may pin different compatible flashinfer versions, so we provide the option to pin different versions for vllm/sglang
+# We use `flashinfer-jit-cache` to avoid slow JIT compilation on first run.
+# Different inference engines may pin different compatible flashinfer versions, so we provide the option to pin different versions for vllm/sglang
-    { url = "https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl", marker = "extra == 'sglang' and extra != 'vllm'" }
+    { url = "https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl", marker = "extra == 'mcore' and extra != 'vllm'" },
+    { url = "https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl", marker = "extra == 'sglang' and extra != 'mcore' and extra != 'vllm'" }
 ]

 [project.optional-dependencies]
@@ -104,14 +111,17 @@ sandboxes = [
     "litellm[proxy]>=1.67.5",
 ]
 vllm = [
-    "vllm==0.10.1.1",
-    "torch==2.7.1",
+    "vllm==0.11.0",
+    "flash-attn==2.8.3",
+    "torch==2.8.0",
     "flashinfer-python",
+    "flashinfer-jit-cache",
     "torchvision"
 ]
 sglang = [
     "sglang[srt,openai,torch_memory_saver]==0.4.8.post1", # 0.4.9.post1 causes non-colocate weight broadcast to hang
```
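The `extra == … and extra != …` markers above gate each flashinfer wheel to exactly one extra, so conflicting pins never co-install. A quick way to sanity-check such a marker with the `packaging` library (a sketch, not part of the PR):

```python
from packaging.markers import Marker

# The marker used for the sglang-only flashinfer wheel above.
m = Marker("extra == 'sglang' and extra != 'mcore' and extra != 'vllm'")

# Resolvers evaluate markers with the active extra in the environment dict.
print(m.evaluate({"extra": "sglang"}))  # True: this wheel applies
print(m.evaluate({"extra": "vllm"}))    # False: the vllm extra uses its own pin
print(m.evaluate({"extra": "mcore"}))   # False: mcore likewise
```

Evaluating the marker against each extra makes it easy to confirm the three wheel entries partition the extras with no overlap.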