From cb0fd128974f74c307f9cafdd2f1c387039ca50f Mon Sep 17 00:00:00 2001
From: sneaxiy <sneaxiy@126.com>
Date: Wed, 3 Sep 2025 14:49:32 +0800
Subject: [PATCH 1/8] Make sharding_first the default hybrid_parallel_topo_order

---
 paddlenlp/trainer/training_args.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddlenlp/trainer/training_args.py b/paddlenlp/trainer/training_args.py
index 82cd661ff78f..ec005732ac55 100644
--- a/paddlenlp/trainer/training_args.py
+++ b/paddlenlp/trainer/training_args.py
@@ -792,7 +792,7 @@ class TrainingArguments:
                 "Following options are supported:\n"
                 "- pp_first. the topo order is dp, pp, sharding, mp \n"
                 "- sharding_first. the topo order is dp, sharding, pp, mp \n"
-                "Default is None, for pp_first"
+                "Default is None, for sharding_first"
             )
         },
     )
@@ -2107,7 +2107,7 @@ def _post_init_parallel_degree(self):
                 self.expert_tensor_parallel_degree = -1
 
         if self.hybrid_parallel_topo_order is None:
-            self.hybrid_parallel_topo_order = "pp_first"
+            self.hybrid_parallel_topo_order = "sharding_first"
         assert self.hybrid_parallel_topo_order in ["pp_first", "sharding_first"]
 
         if self.use_hybrid_parallel and self.enable_auto_parallel:

From 962665c70e4752e0d353366918024a9d1aaa1023 Mon Sep 17 00:00:00 2001
From: sneaxiy <sneaxiy@126.com>
Date: Wed, 3 Sep 2025 19:23:54 +0800
Subject: [PATCH 2/8] Add training_args.py to the run_ci.sh target lists

---
 scripts/distribute/run_ci.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scripts/distribute/run_ci.sh b/scripts/distribute/run_ci.sh
index 7cc072210f30..42f6f82c4965 100644
--- a/scripts/distribute/run_ci.sh
+++ b/scripts/distribute/run_ci.sh
@@ -29,6 +29,7 @@ global_verification_fail_arr=()
 target_lists_for_gpt=(
     "slm/model_zoo/gpt-3"
     "llm/auto_parallel/gpt-3"
+    "paddlenlp/trainer/training_args.py"
     "paddlenlp/transformers/gpt"
     "scripts/distribute"
     ".github/workflows/distribute.yml"
@@ -37,6 +38,7 @@ target_lists_for_gpt=(
 target_lists_for_llama=(
     "llm/auto_parallel/llama"
     "paddlenlp/trainer/auto_trainer.py"
+    "paddlenlp/trainer/training_args.py"
     "paddlenlp/transformers/llama"
     "scripts/distribute"
     ".github/workflows/distribute.yml"
@@ -45,6 +47,7 @@ target_lists_for_llama=(
 target_lists_for_deepseek=(
     "llm/auto_parallel/deepseek-v3"
     "paddlenlp/trainer/auto_trainer.py"
+    "paddlenlp/trainer/training_args.py"
     "paddlenlp/transformers/deepseek_v2/modeling_auto.py"
     "paddlenlp/transformers/deepseek_v2/modeling.py"
     "paddlenlp/transformers/deepseek_v3/modeling_auto.py"

From 366b37302fb233edb4ac657f9ca24ebe4955b41d Mon Sep 17 00:00:00 2001
From: sneaxiy <sneaxiy@126.com>
Date: Fri, 5 Sep 2025 16:53:49 +0800
Subject: [PATCH 3/8] Update V100 baseline

---
 scripts/distribute/ci_case_auto.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/distribute/ci_case_auto.sh b/scripts/distribute/ci_case_auto.sh
index 8003b45af5b5..2c7bec8fdcf6 100755
--- a/scripts/distribute/ci_case_auto.sh
+++ b/scripts/distribute/ci_case_auto.sh
@@ -2590,7 +2590,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2() {
     ips=-1
     mem=-1
     echo "result: loss=$loss ips=$ips mem=$mem loss_md5=$loss_md5"
-    loss_base=10.55727577 # output of dropout is different after supporting spmd
+    loss_base=10.556077 # output of dropout is different after supporting spmd
     ips_base=-1
     mem_base=-1
     if [ $IS_A100 -ne 0 ];then
@@ -2662,7 +2662,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2() {
     ips=-1
     mem=-1
     echo "result: loss=$loss ips=$ips mem=$mem loss_md5=$loss_md5"
-    loss_base=10.57985115 # output of dropout is different after supporting spmd
+    loss_base=10.58583546 # output of dropout is different after supporting spmd
     ips_base=-1
     mem_base=-1
     if [ $IS_A100 -ne 0 ];then
@@ -2735,7 +2735,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2-PP2() {
     mem=-1
     echo "result: loss=$loss ips=$ips mem=$mem loss_md5=$loss_md5"
     # loss_base=10.59993172     # note: need to debug
-    loss_base=10.57274055 # output of dropout is different after supporting spmd
+    loss_base=10.57335854 # output of dropout is different after supporting spmd
     ips_base=-1
     mem_base=-1
     if [ $IS_A100 -ne 0 ];then
@@ -2808,7 +2808,7 @@ function llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
     mem=-1
     echo "result: loss=$loss ips=$ips mem=$mem loss_md5=$loss_md5"
     # loss_base=10.58456802     # note: need to debug
-    loss_base=10.57409477
+    loss_base=10.57439137
     ips_base=-1
     mem_base=-1
     if [ $IS_A100 -ne 0 ];then

From a8dc65b324c4a5c7fd9b29b439c3c1725bdcf871 Mon Sep 17 00:00:00 2001
From: sneaxiy <sneaxiy@126.com>
Date: Fri, 5 Sep 2025 16:58:16 +0800
Subject: [PATCH 4/8] Update A100 baselines

---
 scripts/distribute/ci_case_auto.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/distribute/ci_case_auto.sh b/scripts/distribute/ci_case_auto.sh
index 2c7bec8fdcf6..7a293b9c20ac 100755
--- a/scripts/distribute/ci_case_auto.sh
+++ b/scripts/distribute/ci_case_auto.sh
@@ -933,7 +933,7 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2_hybrid_pp() {
         if [ $IS_CUDA123 -ne 0 ];then
             loss_base=9.57173729
         else
-            loss_base=9.57199001
+            loss_base=9.57873344
         fi
         ips_base=-1
         mem_base=-1
@@ -2594,7 +2594,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2() {
     ips_base=-1
     mem_base=-1
     if [ $IS_A100 -ne 0 ];then
-        loss_base=10.56668472 # after add dropout spmd
+        loss_base=10.55707741 # after add dropout spmd
     fi
     check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run  end ==========="
@@ -2666,7 +2666,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2() {
     ips_base=-1
     mem_base=-1
     if [ $IS_A100 -ne 0 ];then
-        loss_base=10.57280159 # after add dropout spmd
+        loss_base=10.5818882 # after add dropout spmd
     fi
     check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run  end ==========="
@@ -2739,7 +2739,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2-PP2() {
     ips_base=-1
     mem_base=-1
     if [ $IS_A100 -ne 0 ];then
-        loss_base=10.57785797 # after add dropout spmd
+        loss_base=10.59062004 # after add dropout spmd
     fi
     check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run  end ==========="
@@ -2812,7 +2812,7 @@ function llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
     ips_base=-1
     mem_base=-1
     if [ $IS_A100 -ne 0 ];then
-        loss_base=10.57924652 # after add dropout spmd
+        loss_base=10.59183884 # after add dropout spmd
     fi
     check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run  end ==========="

From e507a766430ed0941b8ad8657cae559d69125a2a Mon Sep 17 00:00:00 2001
From: sneaxiy <sneaxiy@126.com>
Date: Fri, 5 Sep 2025 22:44:45 +0800
Subject: [PATCH 5/8] Revert "Update A100 baselines"

This reverts commit a8dc65b324c4a5c7fd9b29b439c3c1725bdcf871 (PATCH 4/8 in
this series; originally recorded as c682b2752a83b0e532a15a6b6f90ce08e386e447).
---
 scripts/distribute/ci_case_auto.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/distribute/ci_case_auto.sh b/scripts/distribute/ci_case_auto.sh
index 7a293b9c20ac..2c7bec8fdcf6 100755
--- a/scripts/distribute/ci_case_auto.sh
+++ b/scripts/distribute/ci_case_auto.sh
@@ -933,7 +933,7 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2_hybrid_pp() {
         if [ $IS_CUDA123 -ne 0 ];then
             loss_base=9.57173729
         else
-            loss_base=9.57873344
+            loss_base=9.57199001
         fi
         ips_base=-1
         mem_base=-1
@@ -2594,7 +2594,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2() {
     ips_base=-1
     mem_base=-1
     if [ $IS_A100 -ne 0 ];then
-        loss_base=10.55707741 # after add dropout spmd
+        loss_base=10.56668472 # after add dropout spmd
     fi
     check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run  end ==========="
@@ -2666,7 +2666,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2() {
     ips_base=-1
     mem_base=-1
     if [ $IS_A100 -ne 0 ];then
-        loss_base=10.5818882 # after add dropout spmd
+        loss_base=10.57280159 # after add dropout spmd
     fi
     check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run  end ==========="
@@ -2739,7 +2739,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2-PP2() {
     ips_base=-1
     mem_base=-1
     if [ $IS_A100 -ne 0 ];then
-        loss_base=10.59062004 # after add dropout spmd
+        loss_base=10.57785797 # after add dropout spmd
     fi
     check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run  end ==========="
@@ -2812,7 +2812,7 @@ function llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
     ips_base=-1
     mem_base=-1
     if [ $IS_A100 -ne 0 ];then
-        loss_base=10.59183884 # after add dropout spmd
+        loss_base=10.57924652 # after add dropout spmd
     fi
     check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run  end ==========="

From d8ce0c9a641a1a774bc34c0f19ad2e006d5dffff Mon Sep 17 00:00:00 2001
From: sneaxiy <sneaxiy@126.com>
Date: Fri, 5 Sep 2025 22:45:00 +0800
Subject: [PATCH 6/8] Revert "Update V100 baseline"

This reverts commit 366b37302fb233edb4ac657f9ca24ebe4955b41d (PATCH 3/8 in
this series; originally recorded as 1ca17b7963d1912bc98c27a5ec1a4bfe0bf8abda).
---
 scripts/distribute/ci_case_auto.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/distribute/ci_case_auto.sh b/scripts/distribute/ci_case_auto.sh
index 2c7bec8fdcf6..8003b45af5b5 100755
--- a/scripts/distribute/ci_case_auto.sh
+++ b/scripts/distribute/ci_case_auto.sh
@@ -2590,7 +2590,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2() {
     ips=-1
     mem=-1
     echo "result: loss=$loss ips=$ips mem=$mem loss_md5=$loss_md5"
-    loss_base=10.556077 # output of dropout is different after supporting spmd
+    loss_base=10.55727577 # output of dropout is different after supporting spmd
     ips_base=-1
     mem_base=-1
     if [ $IS_A100 -ne 0 ];then
@@ -2662,7 +2662,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2() {
     ips=-1
     mem=-1
     echo "result: loss=$loss ips=$ips mem=$mem loss_md5=$loss_md5"
-    loss_base=10.58583546 # output of dropout is different after supporting spmd
+    loss_base=10.57985115 # output of dropout is different after supporting spmd
     ips_base=-1
     mem_base=-1
     if [ $IS_A100 -ne 0 ];then
@@ -2735,7 +2735,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2-PP2() {
     mem=-1
     echo "result: loss=$loss ips=$ips mem=$mem loss_md5=$loss_md5"
     # loss_base=10.59993172     # note: need to debug
-    loss_base=10.57335854 # output of dropout is different after supporting spmd
+    loss_base=10.57274055 # output of dropout is different after supporting spmd
     ips_base=-1
     mem_base=-1
     if [ $IS_A100 -ne 0 ];then
@@ -2808,7 +2808,7 @@ function llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
     mem=-1
     echo "result: loss=$loss ips=$ips mem=$mem loss_md5=$loss_md5"
     # loss_base=10.58456802     # note: need to debug
-    loss_base=10.57439137
+    loss_base=10.57409477
     ips_base=-1
     mem_base=-1
     if [ $IS_A100 -ne 0 ];then

From b7ac3723b804328d3db763dd9135f162a4cb1b77 Mon Sep 17 00:00:00 2001
From: sneaxiy <sneaxiy@126.com>
Date: Fri, 5 Sep 2025 22:45:55 +0800
Subject: [PATCH 7/8] Pin hybrid_parallel_topo_order to pp_first in CI cases

---
 scripts/distribute/ci_case_auto.sh | 46 ++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/scripts/distribute/ci_case_auto.sh b/scripts/distribute/ci_case_auto.sh
index 8003b45af5b5..91d9f89e849f 100755
--- a/scripts/distribute/ci_case_auto.sh
+++ b/scripts/distribute/ci_case_auto.sh
@@ -25,6 +25,8 @@ export llama_data_path=/llama_data
 export llm_gpt_case_path=$root_path/llm/auto_parallel/gpt-3
 export gpt_data_path=/fleetx_data
 
+DEFAULT_TOPO=pp_first
+
 unset CUDA_VISIBLE_DEVICES
 
 function is_a100() {
@@ -256,6 +258,7 @@ function llama_dygraph_auto_bs4_bf16_SD2() {
                 ./run_pretrain_auto.py \
                 --model_name_or_path "meta-llama/Llama-2-7b" \
                 --tokenizer_name_or_path "meta-llama/Llama-2-7b" \
+                --hybrid_parallel_topo_order $DEFAULT_TOPO \
                 --input_dir "./data" \
                 --output_dir "./output" \
                 --weight_decay 0.01 \
@@ -358,6 +361,7 @@ function llama_dygraph_auto_bs8_fp32_DP2() {
         --model_type "llama" \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -429,6 +433,7 @@ function llama_dygraph_auto_bs8_fp32_DP2-MP2() {
         --model_type "llama" \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -511,6 +516,7 @@ function llama_dygraph_auto_bs8_fp32_DP2-MP2-PP2() {
                 --model_type "llama" \
                 --model_name_or_path "facebook/llama-7b" \
                 --tokenizer_name_or_path "facebook/llama-7b" \
+                --hybrid_parallel_topo_order $DEFAULT_TOPO \
                 --input_dir "./data" \
                 --output_dir $case_out_dir \
                 --split 949,50,1 \
@@ -584,6 +590,7 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
         --model_type "llama" \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -659,6 +666,7 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2_intermediate() {
         --use_intermediate_api 1\
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -732,6 +740,7 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-CP2() {
             --model_type "llama" \
             --model_name_or_path "facebook/llama-7b" \
             --tokenizer_name_or_path "facebook/llama-7b" \
+            --hybrid_parallel_topo_order $DEFAULT_TOPO \
             --input_dir "./data" \
             --output_dir $case_out_dir \
             --split 949,50,1 \
@@ -806,6 +815,7 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-CP2_intermediate() {
         python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" --log_dir $case_log_dir run_pretrain_auto.py \
             --model_name_or_path "facebook/llama-7b" \
             --tokenizer_name_or_path "facebook/llama-7b" \
+            --hybrid_parallel_topo_order $DEFAULT_TOPO \
             --input_dir "./data" \
             --output_dir $case_out_dir \
             --split 949,50,1 \
@@ -883,6 +893,7 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2_hybrid_pp() {
             --model_type "llama_pp" \
             --model_name_or_path "facebook/llama-7b" \
             --tokenizer_name_or_path "facebook/llama-7b" \
+            --hybrid_parallel_topo_order $DEFAULT_TOPO \
             --input_dir "./data" \
             --output_dir $case_out_dir \
             --split 949,50,1 \
@@ -950,6 +961,7 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2_hybrid_pp() {
             --model_type "llama_pp" \
             --model_name_or_path "facebook/llama-7b" \
             --tokenizer_name_or_path "facebook/llama-7b" \
+            --hybrid_parallel_topo_order $DEFAULT_TOPO \
             --input_dir "./data" \
             --output_dir $auto_case_out_dir \
             --split 949,50,1 \
@@ -1031,6 +1043,7 @@ function llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2() {
             ./run_pretrain_auto.py \
             --model_name_or_path "meta-llama/Llama-2-13b" \
             --tokenizer_name_or_path "meta-llama/Llama-2-13b" \
+            --hybrid_parallel_topo_order $DEFAULT_TOPO \
             --input_dir "./data" \
             --output_dir "./output" \
             --split 949,50,1 \
@@ -1131,6 +1144,7 @@ function llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2-VPP3_split_bw() {
             ./run_pretrain_auto.py \
             --model_name_or_path "meta-llama/Llama-2-13b" \
             --tokenizer_name_or_path "meta-llama/Llama-2-13b" \
+            --hybrid_parallel_topo_order $DEFAULT_TOPO \
             --input_dir "./data" \
             --output_dir "./output" \
             --split 949,50,1 \
@@ -1245,6 +1259,7 @@ function llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP1-SP() {
                     --model_type "llama" \
                     --model_name_or_path "facebook/llama-7b" \
                     --tokenizer_name_or_path "facebook/llama-7b" \
+                    --hybrid_parallel_topo_order $DEFAULT_TOPO \
                     --input_dir "./data" \
                     --output_dir $case_out_dir \
                     --split 949,50,1 \
@@ -1361,6 +1376,7 @@ function llama_pir_auto_fuse_ffn_attention_qkv_MP2() {
                 run_pretrain_auto.py \
                 --model_name_or_path "facebook/llama-7b" \
                 --tokenizer_name_or_path "facebook/llama-7b" \
+                --hybrid_parallel_topo_order $DEFAULT_TOPO \
                 --input_dir "./data" \
                 --output_dir $auto_case_out_dir \
                 --split 949,50,1 \
@@ -1523,6 +1539,7 @@ function llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP2-SP() {
             --model_type "llama" \
             --model_name_or_path "facebook/llama-7b" \
             --tokenizer_name_or_path "facebook/llama-7b" \
+            --hybrid_parallel_topo_order $DEFAULT_TOPO \
             --input_dir "./data" \
             --output_dir $case_out_dir \
             --split 949,50,1 \
@@ -1623,6 +1640,7 @@ function llama_align_dygraph_dy2st_auto_bs2_bf16_DP2-MP1-PP1() {
             --model_type "llama" \
             --model_name_or_path "facebook/llama-7b" \
             --tokenizer_name_or_path "facebook/llama-7b" \
+            --hybrid_parallel_topo_order $DEFAULT_TOPO \
             --input_dir "./data" \
             --output_dir $case_out_dir \
             --split 949,50,1 \
@@ -1737,6 +1755,7 @@ function llama_dy2st_auto_bs2_bf16_DP2-MP1-PP1-CINN() {
         --model_type "llama" \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -1836,6 +1855,7 @@ function llama_dpo_dy2st_auto_bs2_bf16_MP8_intermediate() {
         --log_dir $case_log_dir \
         ../run_dpo_auto.py\
         --model_name_or_path "meta-llama/Meta-Llama-3.1-8B-Instruct" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --train_dataset_path ${llama_data_path}/data_dpo/data/train.jsonl \
         --dev_dataset_path ${llama_data_path}/data_dpo/data/dev.jsonl \
         --output_dir ./checkpoints/dpo_ckpts \
@@ -1926,6 +1946,7 @@ function llama_align_dygraph_dy2st_pir_auto_grad_merge_bs2_fp32_DP1-MP1-PP1() {
             --model_type "llama" \
             --model_name_or_path "facebook/llama-7b" \
             --tokenizer_name_or_path "facebook/llama-7b" \
+            --hybrid_parallel_topo_order $DEFAULT_TOPO \
             --input_dir "./data" \
             --output_dir $case_out_dir \
             --split 949,50,1 \
@@ -2033,6 +2054,7 @@ function llama_align_dy2st_fthenb_and_vpp_auto_bs2_fp32_DP1-MP1-PP4() {
             --model_type "llama" \
             --model_name_or_path "facebook/llama-7b" \
             --tokenizer_name_or_path "facebook/llama-7b" \
+            --hybrid_parallel_topo_order $DEFAULT_TOPO \
             --input_dir "./data" \
             --output_dir $case_out_dir \
             --split 949,50,1 \
@@ -2156,6 +2178,7 @@ function llama_align_dygraph_dy2st_pir_auto_pp_bs2_bf16_DP1-MP1-PP4() {
             --model_type "llama" \
             --model_name_or_path "facebook/llama-7b" \
             --tokenizer_name_or_path "facebook/llama-7b" \
+            --hybrid_parallel_topo_order $DEFAULT_TOPO \
             --input_dir "./data" \
             --output_dir $case_out_dir \
             --split 949,50,1 \
@@ -2248,6 +2271,7 @@ function llama_convert_hybrid_ckpt_to_auto_parallel_bs2_fp32_DP2-MP1-PP1() {
         ../../run_pretrain.py \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $dy_case_out_dir \
         --split 949,50,1 \
@@ -2321,6 +2345,7 @@ function llama_convert_hybrid_ckpt_to_auto_parallel_bs2_fp32_DP2-MP1-PP1() {
         run_pretrain_auto.py \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $auto_case_out_dir \
         --split 949,50,1 \
@@ -2403,6 +2428,7 @@ function llama_baichuan_pir_auto_fuse_ffn_attention_qkv_DP2_MP2_PP2(){
         --model_type "llama" \
         --model_name_or_path "baichuan-inc/Baichuan2-13B-Base" \
         --tokenizer_name_or_path "baichuan-inc/Baichuan2-13B-Base" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -2475,6 +2501,7 @@ function llama_baichuan_pir_auto_fuse_ffn_attention_qkv_DP2_MP2_PP2_intermediate
         --use_intermediate_api true \
         --model_name_or_path "baichuan-inc/Baichuan2-13B-Base" \
         --tokenizer_name_or_path "baichuan-inc/Baichuan2-13B-Base" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -2548,6 +2575,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2() {
         run_pretrain_auto.py \
         --model_name_or_path gpt2-medium-en \
         --tokenizer_name_or_path gpt2-medium-en \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "$gpt_data_path/data" \
         --output_dir "output/$task_name" \
         --split 949,50,1 \
@@ -2620,6 +2648,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2() {
         run_pretrain_auto.py \
         --model_name_or_path gpt2-medium-en \
         --tokenizer_name_or_path gpt2-medium-en \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "$gpt_data_path/data" \
         --output_dir $case_out_dir  \
         --split 949,50,1 \
@@ -2692,6 +2721,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2-PP2() {
         run_pretrain_auto.py \
         --model_name_or_path gpt2-medium-en \
         --tokenizer_name_or_path gpt2-medium-en \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "$gpt_data_path/data" \
         --output_dir $case_out_dir  \
         --split 949,50,1 \
@@ -2765,6 +2795,7 @@ function llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
         run_pretrain_auto.py \
         --model_name_or_path gpt2-medium-en \
         --tokenizer_name_or_path gpt2-medium-en \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "$gpt_data_path/data" \
         --output_dir $case_out_dir  \
         --split 949,50,1 \
@@ -2838,6 +2869,7 @@ function llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2_intermediate() {
         run_pretrain_auto.py \
         --model_name_or_path gpt2-medium-en \
         --tokenizer_name_or_path gpt2-medium-en \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "$gpt_data_path/data" \
         --output_dir $case_out_dir  \
         --split 949,50,1 \
@@ -2911,6 +2943,7 @@ function llm_gpt_pir_auto_bs4_TP2(){
         run_pretrain_auto.py \
         --model_name_or_path gpt3-13B-en \
         --tokenizer_name_or_path gpt3-13B-en \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "$gpt_data_path/data" \
         --output_dir "output/$task_name" \
         --split 949,50,1 \
@@ -2978,6 +3011,7 @@ function llm_gpt_pir_auto_bs4_TP2_PP2(){
             run_pretrain_auto.py \
             --model_name_or_path gpt3-13B-en \
             --tokenizer_name_or_path gpt3-13B-en \
+            --hybrid_parallel_topo_order $DEFAULT_TOPO \
             --input_dir "$gpt_data_path/data" \
             --output_dir "output/$task_name" \
             --split 949,50,1 \
@@ -3041,6 +3075,7 @@ function llm_gpt_pir_auto_bs8_DP2_TP2_PP2(){
         run_pretrain_auto.py \
         --model_name_or_path gpt3-13B-en \
         --tokenizer_name_or_path gpt3-13B-en \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "$gpt_data_path/data" \
         --output_dir "output/$task_name" \
         --split 949,50,1 \
@@ -3107,6 +3142,7 @@ function llm_gpt_pir_auto_bs8_DP2_TP2_PP2_intermediate(){
         run_pretrain_auto.py \
         --model_name_or_path gpt3-13B-en \
         --tokenizer_name_or_path gpt3-13B-en \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "$gpt_data_path/data" \
         --output_dir "output/$task_name" \
         --split 949,50,1 \
@@ -3163,6 +3199,7 @@ function llm_qwen_dygraph_auto_bs1_fp32_DP2() {
 {
     "model_name_or_path": "qwen/qwen-7b",
     "tokenizer_name_or_path": "qwen/qwen-7b",
+    "hybrid_parallel_topo_order": \"$DEFAULT_TOPO\",
     "input_dir": "./data",
     "output_dir": "./checkpoints/qwen_pretrain_ckpts",
     "per_device_train_batch_size": 1,
@@ -3254,6 +3291,7 @@ function llm_qwen_dygraph_auto_bs1_fp32_DP2-MP2() {
 {
     "model_name_or_path": "qwen/qwen-7b",
     "tokenizer_name_or_path": "qwen/qwen-7b",
+    "hybrid_parallel_topo_order": \"$DEFAULT_TOPO\",
     "input_dir": "./data",
     "output_dir": "./checkpoints/qwen_pretrain_ckpts",
     "per_device_train_batch_size": 1,
@@ -3343,6 +3381,7 @@ function llm_qwen_dygraph_auto_bs1_fp32_DP2-MP2-PP2() {
 {
     "model_name_or_path": "qwen/qwen-7b",
     "tokenizer_name_or_path": "qwen/qwen-7b",
+    "hybrid_parallel_topo_order": \"$DEFAULT_TOPO\",
     "input_dir": "./data",
     "output_dir": "./checkpoints/qwen_pretrain_ckpts",
     "per_device_train_batch_size": 1,
@@ -3434,6 +3473,7 @@ function llm_qwen_dygraph_auto_bs1_bf16_DP2-MP2-PP2() {
 {
     "model_name_or_path": "qwen/qwen-7b",
     "tokenizer_name_or_path": "qwen/qwen-7b",
+    "hybrid_parallel_topo_order": \"$DEFAULT_TOPO\",
     "input_dir": "./data",
     "output_dir": "./checkpoints/qwen_pretrain_ckpts",
     "per_device_train_batch_size": 1,
@@ -3547,6 +3587,7 @@ function llm_qwen_pir_auto_bs1_bf16_TP2(){
         run_pretrain_auto.py \
         --model_name_or_path "qwen/qwen-14b" \
         --tokenizer_name_or_path "qwen/qwen-14b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir "output/$task_name/" \
         --per_device_train_batch_size 1\
@@ -3624,6 +3665,7 @@ function llm_qwen_pir_auto_bs1_bf16_TP2_PP2(){
         run_pretrain_auto.py \
         --model_name_or_path "qwen/qwen-14b" \
         --tokenizer_name_or_path "qwen/qwen-14b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir "output/$task_name/" \
         --per_device_train_batch_size 1\
@@ -3694,6 +3736,7 @@ function llama_lora_static_graph_auto_bs_2_bf16_DP2-TP2-PP1() {
         --log_dir  "$case_log_dir" \
         ../run_finetune_auto.py \
         --model_name_or_path "meta-llama/Meta-Llama-3.1-8B-Instruct" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --dataset_name_or_path "./data" \
         --output_dir "$case_out_dir" \
         --enable_auto_parallel true \
@@ -3853,6 +3896,7 @@ if [ $IS_A100 -eq 1 ]; then
     --model_type "deepseekv3_auto" \
     --model_name_or_path $model_config_json \
     --tokenizer_name_or_path "deepseek-ai/DeepSeek-V3" \
+    --hybrid_parallel_topo_order $DEFAULT_TOPO \
     --input_dir "./data" \
     --output_dir "output/$task_name" \
     --split 949,50,1 \
@@ -3999,6 +4043,7 @@ if [ $IS_A100 -eq 1 ]; then
     --model_type "deepseekv3_auto" \
     --model_name_or_path $model_config_json \
     --tokenizer_name_or_path "deepseek-ai/DeepSeek-V3" \
+    --hybrid_parallel_topo_order $DEFAULT_TOPO \
     --input_dir "./data" \
     --output_dir "output/$task_name" \
     --split 949,50,1 \
@@ -4075,6 +4120,7 @@ function llama_baichuan_dygraph_auto_sp_async_reduce_scatter_bs8_bf16_DP4-MP2-SP
 {
     "model_name_or_path": "baichuan-inc/Baichuan2-13B-Base",
     "tokenizer_name_or_path": "baichuan-inc/Baichuan2-13B-Base",
+    "hybrid_parallel_topo_order": \"$DEFAULT_TOPO\",
     "input_dir": "./data",
     "output_dir": "./checkpoints/baichuan2_13b_ckpts",
     "split": "949,50,1",

From adc6e1fef33c3034aa5ac215232cef0177e52fba Mon Sep 17 00:00:00 2001
From: sneaxiy <sneaxiy@126.com>
Date: Fri, 5 Sep 2025 22:52:03 +0800
Subject: [PATCH 8/8] Fix JSON format error: drop escaped quotes around $DEFAULT_TOPO

---
 scripts/distribute/ci_case_auto.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/distribute/ci_case_auto.sh b/scripts/distribute/ci_case_auto.sh
index 91d9f89e849f..265644a17276 100755
--- a/scripts/distribute/ci_case_auto.sh
+++ b/scripts/distribute/ci_case_auto.sh
@@ -3199,7 +3199,7 @@ function llm_qwen_dygraph_auto_bs1_fp32_DP2() {
 {
     "model_name_or_path": "qwen/qwen-7b",
     "tokenizer_name_or_path": "qwen/qwen-7b",
-    "hybrid_parallel_topo_order": \"$DEFAULT_TOPO\",
+    "hybrid_parallel_topo_order": "$DEFAULT_TOPO",
     "input_dir": "./data",
     "output_dir": "./checkpoints/qwen_pretrain_ckpts",
     "per_device_train_batch_size": 1,
@@ -3291,7 +3291,7 @@ function llm_qwen_dygraph_auto_bs1_fp32_DP2-MP2() {
 {
     "model_name_or_path": "qwen/qwen-7b",
     "tokenizer_name_or_path": "qwen/qwen-7b",
-    "hybrid_parallel_topo_order": \"$DEFAULT_TOPO\",
+    "hybrid_parallel_topo_order": "$DEFAULT_TOPO",
     "input_dir": "./data",
     "output_dir": "./checkpoints/qwen_pretrain_ckpts",
     "per_device_train_batch_size": 1,
@@ -3381,7 +3381,7 @@ function llm_qwen_dygraph_auto_bs1_fp32_DP2-MP2-PP2() {
 {
     "model_name_or_path": "qwen/qwen-7b",
     "tokenizer_name_or_path": "qwen/qwen-7b",
-    "hybrid_parallel_topo_order": \"$DEFAULT_TOPO\",
+    "hybrid_parallel_topo_order": "$DEFAULT_TOPO",
     "input_dir": "./data",
     "output_dir": "./checkpoints/qwen_pretrain_ckpts",
     "per_device_train_batch_size": 1,
@@ -3473,7 +3473,7 @@ function llm_qwen_dygraph_auto_bs1_bf16_DP2-MP2-PP2() {
 {
     "model_name_or_path": "qwen/qwen-7b",
     "tokenizer_name_or_path": "qwen/qwen-7b",
-    "hybrid_parallel_topo_order": \"$DEFAULT_TOPO\",
+    "hybrid_parallel_topo_order": "$DEFAULT_TOPO",
     "input_dir": "./data",
     "output_dir": "./checkpoints/qwen_pretrain_ckpts",
     "per_device_train_batch_size": 1,
@@ -4120,7 +4120,7 @@ function llama_baichuan_dygraph_auto_sp_async_reduce_scatter_bs8_bf16_DP4-MP2-SP
 {
     "model_name_or_path": "baichuan-inc/Baichuan2-13B-Base",
     "tokenizer_name_or_path": "baichuan-inc/Baichuan2-13B-Base",
-    "hybrid_parallel_topo_order": \"$DEFAULT_TOPO\",
+    "hybrid_parallel_topo_order": "$DEFAULT_TOPO",
     "input_dir": "./data",
     "output_dir": "./checkpoints/baichuan2_13b_ckpts",
     "split": "949,50,1",