@@ -25,6 +25,8 @@ export llama_data_path=/llama_data
 export llm_gpt_case_path=$root_path/llm/auto_parallel/gpt-3
 export gpt_data_path=/fleetx_data
 
+DEFAULT_TOPO=pp_first
+
 unset CUDA_VISIBLE_DEVICES
 
 function is_a100() {
@@ -256,6 +258,7 @@ function llama_dygraph_auto_bs4_bf16_SD2() {
         ./run_pretrain_auto.py \
         --model_name_or_path "meta-llama/Llama-2-7b" \
         --tokenizer_name_or_path "meta-llama/Llama-2-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir "./output" \
         --weight_decay 0.01 \
@@ -358,6 +361,7 @@ function llama_dygraph_auto_bs8_fp32_DP2() {
         --model_type "llama" \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -429,6 +433,7 @@ function llama_dygraph_auto_bs8_fp32_DP2-MP2() {
         --model_type "llama" \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -511,6 +516,7 @@ function llama_dygraph_auto_bs8_fp32_DP2-MP2-PP2() {
         --model_type "llama" \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -584,6 +590,7 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
         --model_type "llama" \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -659,6 +666,7 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2_intermediate() {
         --use_intermediate_api 1\
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -732,6 +740,7 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-CP2() {
         --model_type "llama" \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -806,6 +815,7 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-CP2_intermediate() {
     python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" --log_dir $case_log_dir run_pretrain_auto.py \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -883,6 +893,7 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2_hybrid_pp() {
         --model_type "llama_pp" \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -950,6 +961,7 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2_hybrid_pp() {
         --model_type "llama_pp" \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $auto_case_out_dir \
         --split 949,50,1 \
@@ -1031,6 +1043,7 @@ function llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2() {
         ./run_pretrain_auto.py \
         --model_name_or_path "meta-llama/Llama-2-13b" \
         --tokenizer_name_or_path "meta-llama/Llama-2-13b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir "./output" \
         --split 949,50,1 \
@@ -1131,6 +1144,7 @@ function llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2-VPP3_split_bw() {
         ./run_pretrain_auto.py \
         --model_name_or_path "meta-llama/Llama-2-13b" \
         --tokenizer_name_or_path "meta-llama/Llama-2-13b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir "./output" \
         --split 949,50,1 \
@@ -1245,6 +1259,7 @@ function llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP1-SP() {
         --model_type "llama" \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -1361,6 +1376,7 @@ function llama_pir_auto_fuse_ffn_attention_qkv_MP2() {
         run_pretrain_auto.py \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $auto_case_out_dir \
         --split 949,50,1 \
@@ -1523,6 +1539,7 @@ function llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP2-SP() {
         --model_type "llama" \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -1623,6 +1640,7 @@ function llama_align_dygraph_dy2st_auto_bs2_bf16_DP2-MP1-PP1() {
         --model_type "llama" \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -1737,6 +1755,7 @@ function llama_dy2st_auto_bs2_bf16_DP2-MP1-PP1-CINN() {
         --model_type "llama" \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -1836,6 +1855,7 @@ function llama_dpo_dy2st_auto_bs2_bf16_MP8_intermediate() {
         --log_dir $case_log_dir \
         ../run_dpo_auto.py\
         --model_name_or_path "meta-llama/Meta-Llama-3.1-8B-Instruct" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --train_dataset_path ${llama_data_path}/data_dpo/data/train.jsonl \
         --dev_dataset_path ${llama_data_path}/data_dpo/data/dev.jsonl \
         --output_dir ./checkpoints/dpo_ckpts \
@@ -1926,6 +1946,7 @@ function llama_align_dygraph_dy2st_pir_auto_grad_merge_bs2_fp32_DP1-MP1-PP1() {
         --model_type "llama" \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -2033,6 +2054,7 @@ function llama_align_dy2st_fthenb_and_vpp_auto_bs2_fp32_DP1-MP1-PP4() {
         --model_type "llama" \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -2156,6 +2178,7 @@ function llama_align_dygraph_dy2st_pir_auto_pp_bs2_bf16_DP1-MP1-PP4() {
         --model_type "llama" \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -2248,6 +2271,7 @@ function llama_convert_hybrid_ckpt_to_auto_parallel_bs2_fp32_DP2-MP1-PP1() {
         ../../run_pretrain.py \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $dy_case_out_dir \
         --split 949,50,1 \
@@ -2321,6 +2345,7 @@ function llama_convert_hybrid_ckpt_to_auto_parallel_bs2_fp32_DP2-MP1-PP1() {
         run_pretrain_auto.py \
         --model_name_or_path "facebook/llama-7b" \
         --tokenizer_name_or_path "facebook/llama-7b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $auto_case_out_dir \
         --split 949,50,1 \
@@ -2403,6 +2428,7 @@ function llama_baichuan_pir_auto_fuse_ffn_attention_qkv_DP2_MP2_PP2(){
         --model_type "llama" \
         --model_name_or_path "baichuan-inc/Baichuan2-13B-Base" \
         --tokenizer_name_or_path "baichuan-inc/Baichuan2-13B-Base" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -2475,6 +2501,7 @@ function llama_baichuan_pir_auto_fuse_ffn_attention_qkv_DP2_MP2_PP2_intermediate
         --use_intermediate_api true \
         --model_name_or_path "baichuan-inc/Baichuan2-13B-Base" \
         --tokenizer_name_or_path "baichuan-inc/Baichuan2-13B-Base" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -2548,6 +2575,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2() {
         run_pretrain_auto.py \
         --model_name_or_path gpt2-medium-en \
         --tokenizer_name_or_path gpt2-medium-en \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "$gpt_data_path/data" \
         --output_dir "output/$task_name" \
         --split 949,50,1 \
@@ -2620,6 +2648,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2() {
         run_pretrain_auto.py \
         --model_name_or_path gpt2-medium-en \
         --tokenizer_name_or_path gpt2-medium-en \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "$gpt_data_path/data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -2692,6 +2721,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2-PP2() {
         run_pretrain_auto.py \
         --model_name_or_path gpt2-medium-en \
         --tokenizer_name_or_path gpt2-medium-en \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "$gpt_data_path/data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -2765,6 +2795,7 @@ function llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
         run_pretrain_auto.py \
         --model_name_or_path gpt2-medium-en \
         --tokenizer_name_or_path gpt2-medium-en \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "$gpt_data_path/data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -2838,6 +2869,7 @@ function llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2_intermediate() {
         run_pretrain_auto.py \
         --model_name_or_path gpt2-medium-en \
         --tokenizer_name_or_path gpt2-medium-en \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "$gpt_data_path/data" \
         --output_dir $case_out_dir \
         --split 949,50,1 \
@@ -2911,6 +2943,7 @@ function llm_gpt_pir_auto_bs4_TP2(){
         run_pretrain_auto.py \
         --model_name_or_path gpt3-13B-en \
         --tokenizer_name_or_path gpt3-13B-en \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "$gpt_data_path/data" \
         --output_dir "output/$task_name" \
         --split 949,50,1 \
@@ -2978,6 +3011,7 @@ function llm_gpt_pir_auto_bs4_TP2_PP2(){
         run_pretrain_auto.py \
         --model_name_or_path gpt3-13B-en \
         --tokenizer_name_or_path gpt3-13B-en \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "$gpt_data_path/data" \
         --output_dir "output/$task_name" \
         --split 949,50,1 \
@@ -3041,6 +3075,7 @@ function llm_gpt_pir_auto_bs8_DP2_TP2_PP2(){
         run_pretrain_auto.py \
         --model_name_or_path gpt3-13B-en \
         --tokenizer_name_or_path gpt3-13B-en \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "$gpt_data_path/data" \
         --output_dir "output/$task_name" \
         --split 949,50,1 \
@@ -3107,6 +3142,7 @@ function llm_gpt_pir_auto_bs8_DP2_TP2_PP2_intermediate(){
         run_pretrain_auto.py \
         --model_name_or_path gpt3-13B-en \
         --tokenizer_name_or_path gpt3-13B-en \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "$gpt_data_path/data" \
         --output_dir "output/$task_name" \
         --split 949,50,1 \
@@ -3163,6 +3199,7 @@ function llm_qwen_dygraph_auto_bs1_fp32_DP2() {
 {
     "model_name_or_path": "qwen/qwen-7b",
     "tokenizer_name_or_path": "qwen/qwen-7b",
+    "hybrid_parallel_topo_order": "$DEFAULT_TOPO",
     "input_dir": "./data",
     "output_dir": "./checkpoints/qwen_pretrain_ckpts",
     "per_device_train_batch_size": 1,
@@ -3254,6 +3291,7 @@ function llm_qwen_dygraph_auto_bs1_fp32_DP2-MP2() {
 {
     "model_name_or_path": "qwen/qwen-7b",
     "tokenizer_name_or_path": "qwen/qwen-7b",
+    "hybrid_parallel_topo_order": "$DEFAULT_TOPO",
     "input_dir": "./data",
     "output_dir": "./checkpoints/qwen_pretrain_ckpts",
     "per_device_train_batch_size": 1,
@@ -3343,6 +3381,7 @@ function llm_qwen_dygraph_auto_bs1_fp32_DP2-MP2-PP2() {
 {
     "model_name_or_path": "qwen/qwen-7b",
     "tokenizer_name_or_path": "qwen/qwen-7b",
+    "hybrid_parallel_topo_order": "$DEFAULT_TOPO",
     "input_dir": "./data",
     "output_dir": "./checkpoints/qwen_pretrain_ckpts",
     "per_device_train_batch_size": 1,
@@ -3434,6 +3473,7 @@ function llm_qwen_dygraph_auto_bs1_bf16_DP2-MP2-PP2() {
 {
     "model_name_or_path": "qwen/qwen-7b",
     "tokenizer_name_or_path": "qwen/qwen-7b",
+    "hybrid_parallel_topo_order": "$DEFAULT_TOPO",
     "input_dir": "./data",
     "output_dir": "./checkpoints/qwen_pretrain_ckpts",
     "per_device_train_batch_size": 1,
@@ -3547,6 +3587,7 @@ function llm_qwen_pir_auto_bs1_bf16_TP2(){
         run_pretrain_auto.py \
         --model_name_or_path "qwen/qwen-14b" \
         --tokenizer_name_or_path "qwen/qwen-14b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir "output/$task_name/" \
         --per_device_train_batch_size 1\
@@ -3624,6 +3665,7 @@ function llm_qwen_pir_auto_bs1_bf16_TP2_PP2(){
         run_pretrain_auto.py \
         --model_name_or_path "qwen/qwen-14b" \
         --tokenizer_name_or_path "qwen/qwen-14b" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir "output/$task_name/" \
         --per_device_train_batch_size 1\
@@ -3694,6 +3736,7 @@ function llama_lora_static_graph_auto_bs_2_bf16_DP2-TP2-PP1() {
         --log_dir "$case_log_dir" \
         ../run_finetune_auto.py \
         --model_name_or_path "meta-llama/Meta-Llama-3.1-8B-Instruct" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --dataset_name_or_path "./data" \
         --output_dir "$case_out_dir" \
         --enable_auto_parallel true \
@@ -3853,6 +3896,7 @@ if [ $IS_A100 -eq 1 ]; then
         --model_type "deepseekv3_auto" \
         --model_name_or_path $model_config_json \
         --tokenizer_name_or_path "deepseek-ai/DeepSeek-V3" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir "output/$task_name" \
         --split 949,50,1 \
@@ -3999,6 +4043,7 @@ if [ $IS_A100 -eq 1 ]; then
         --model_type "deepseekv3_auto" \
         --model_name_or_path $model_config_json \
         --tokenizer_name_or_path "deepseek-ai/DeepSeek-V3" \
+        --hybrid_parallel_topo_order $DEFAULT_TOPO \
         --input_dir "./data" \
         --output_dir "output/$task_name" \
         --split 949,50,1 \
@@ -4075,6 +4120,7 @@ function llama_baichuan_dygraph_auto_sp_async_reduce_scatter_bs8_bf16_DP4-MP2-SP
 {
     "model_name_or_path": "baichuan-inc/Baichuan2-13B-Base",
     "tokenizer_name_or_path": "baichuan-inc/Baichuan2-13B-Base",
+    "hybrid_parallel_topo_order": "$DEFAULT_TOPO",
     "input_dir": "./data",
     "output_dir": "./checkpoints/baichuan2_13b_ckpts",
     "split": "949,50,1",