
Commit 13eb7f7

Merge branch 'PaddlePaddle:develop' into add_sharded_state_dict
2 parents 9805b27 + c83684f commit 13eb7f7


41 files changed (+1291 −631 lines)

.github/workflows/lint.yml

Lines changed: 2 additions & 2 deletions
@@ -72,8 +72,6 @@ jobs:
           else
             echo "local develop branch exist, skipping"
           fi
-
-          unset http_proxy && unset https_proxy
           '

       - name: Setup Environment
@@ -84,6 +82,7 @@ jobs:
           set -e
           python -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
           python -m pip config set global.cache-dir "/home/.cache/pip"
+          source $work_dir/../../../proxy
           python -m pip install --upgrade pip
           cd /workspace/PaddleNLP && git config --global --add safe.directory $PWD
           make install
@@ -93,6 +92,7 @@ jobs:
         run: |
           docker exec -t $container_name /bin/bash -c '
           set -e
+          source $work_dir/../../../proxy
           cd /workspace/PaddleNLP
           make lint
           '

.github/workflows/unittest-cpu.yml

Lines changed: 12 additions & 1 deletion
@@ -11,6 +11,11 @@ env:
   COMMIT_ID: ${{ github.event.pull_request.head.sha }}
   BRANCH: ${{ github.event.pull_request.base.ref }}
   TASK: PaddleNLP-CI-${{ github.event.pull_request.number }}-unittest-cpu
+  HF_ENDPOINT: https://hf-mirror.com
+  STUDIO_GIT_HOST: http://git.prod.idc-to-cloud.aistudio.baidu-int.com
+  PPNLP_HOME: /home/disk1/cache
+  HF_DATASETS_CACHE: /home/disk1/cache/huggingface/datasets
+  TRANSFORMERS_CACHE: /home/disk1/cache/huggingface

 jobs:
   Test:
@@ -33,12 +38,18 @@ jobs:
             -v $work_dir/../../..:$work_dir/../../.. \
             -v $work_dir:/workspace \
             -v /home/.cache/pip:/home/.cache/pip \
+            -v /home/disk1/cache:/home/disk1/cache \
             -e BRANCH \
             -e PR_ID \
             -e COMMIT_ID \
             -e work_dir \
             -e no_proxy \
             -e python_version \
+            -e HF_ENDPOINT \
+            -e STUDIO_GIT_HOST \
+            -e PPNLP_HOME \
+            -e HF_DATASETS_CACHE \
+            -e TRANSFORMERS_CACHE \
             -w /workspace ${docker_image}

       - name: Download Code
@@ -68,7 +79,6 @@ jobs:
             echo "Not in a pull_request event. Skipping PR-specific operations."
           fi
           git log --pretty=oneline -10
-          unset http_proxy && unset https_proxy
           '

       - name: Setup Environment
@@ -79,6 +89,7 @@ jobs:
           set -e
           python -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
           python -m pip config set global.cache-dir "/home/.cache/pip"
+          source $work_dir/../../../proxy
           python -m pip install --upgrade pip
           cd /workspace/PaddleNLP && git config --global --add safe.directory $PWD
           pip install -r tests/requirements.txt
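As an aside (not part of the commit), a minimal Python sketch of how these additions fit together: the `-e` flags forward the new variables into the CI container, where the Hugging Face and PaddleNLP tooling read them from the environment, so downloads go through the hf-mirror endpoint and land in the shared /home/disk1/cache volume. The snippet below only echoes the values and assumes it runs inside that container.

import os

# Illustrative only: print the cache/mirror settings the workflow exports above.
# HF_ENDPOINT redirects Hugging Face Hub traffic to the mirror; the cache paths
# keep datasets and model weights on the persistent /home/disk1/cache mount.
for name in ("HF_ENDPOINT", "STUDIO_GIT_HOST", "PPNLP_HOME", "HF_DATASETS_CACHE", "TRANSFORMERS_CACHE"):
    print(f"{name}={os.environ.get(name, '<unset>')}")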

llm/alignment/rl/gsm8k_processor.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Preprocess the GSM8K dataset into JSONL prompt/answer pairs for RL training.
+"""
+
+import argparse
+import os
+import re
+
+import datasets
+
+
+def extract_solution(solution_str):
+    solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
+    assert solution is not None
+    final_solution = solution.group(0)
+    final_solution = final_solution.split("#### ")[1].replace(",", "")
+    return final_solution
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--local_dir", default="./gsm8k")
+
+    args = parser.parse_args()
+
+    data_source = "openai/gsm8k"
+
+    dataset = datasets.load_dataset(data_source, "main")
+
+    train_dataset = dataset["train"]
+    test_dataset = dataset["test"]
+
+    instruction_following = 'Let\'s think step by step and output the final answer after "####".'
+
+    # build a chat-formatted prompt and extract the final numeric answer for each example
+    def make_map_fn(split):
+        def process_fn(example, idx):
+            question_raw = "<|im_start|>user\n" + example.pop("question")
+
+            system_raw = (
+                "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n"
+            )
+            question = system_raw + question_raw + " " + instruction_following + "<|im_end|>\n<|im_start|>assistant\n"
+
+            answer_raw = example.pop("answer")
+            solution = extract_solution(answer_raw)
+            data = {
+                "src": question,
+                "tgt": solution,
+            }
+            return data
+
+        return process_fn
+
+    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
+    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)
+
+    local_dir = args.local_dir
+
+    train_dataset.to_json(os.path.join(local_dir, "train.jsonl"), orient="records", lines=True)
+    test_dataset.to_json(os.path.join(local_dir, "test.jsonl"), orient="records", lines=True)
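For readers unfamiliar with the GSM8K answer format, here is a small self-contained sketch (my own illustration, not part of the commit) of what extract_solution does; the sample answer string is made up. The script itself would be invoked along the lines of `python llm/alignment/rl/gsm8k_processor.py --local_dir ./gsm8k`, producing train.jsonl/test.jsonl records with "src" and "tgt" fields.

import re

def extract_solution(solution_str):
    # Same regex as above: capture the final "#### <number>" marker and strip commas.
    solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
    assert solution is not None
    return solution.group(0).split("#### ")[1].replace(",", "")

# Made-up GSM8K-style answer text for illustration.
sample_answer = "Natalia sold 48 clips in April and 24 in May, 72 in total.\n#### 72"
print(extract_solution(sample_answer))  # -> "72"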

llm/alignment/rl/run_rl.py

Lines changed: 47 additions & 41 deletions
@@ -42,6 +42,7 @@
 from paddlenlp.transformers import (
     AutoConfig,
     AutoModelForCausalLM,
+    AutoModelForTokenClassification,
     AutoTokenizer,
     PretrainedConfig,
 )
@@ -134,7 +135,6 @@ def create_actor_models(
     )
     if not training_args.autotuner_benchmark:
         reference_model.set_state_dict(actor_model.state_dict())
-
     actor_tokenizer = AutoTokenizer.from_pretrained(
         model_args.actor_model_name_or_path,
         model_max_length=data_args.max_length,
@@ -210,46 +210,43 @@ def create_critic_models(
     data_args: DataArgument,
     training_args: TrainingArguments,
     common_config: Dict,
-    reward_model,
 ):
     with timers_scope_runtimer("Critic model loading time"):
-        reward_model_config = reward_model.config
-        if model_args.critic_model_name_or_path is None:
-            model_args.critic_model_name_or_path = model_args.reward_model_name_or_path
-            critic_model = AutoModelForScore.from_config(
-                reward_model_config,
-                dtype=training_args.model_dtype,
-                score_type="critic",
-                do_normalize=False,
-                clip_range_value=training_args.clip_range_value,
-                **common_config,
+        critic_model_config = AutoConfig.from_pretrained(
+            model_args.critic_model_name_or_path,
+            tensor_parallel_output=training_args.tensor_parallel_output,
+            tensor_parallel_degree=training_args.tensor_parallel_degree,
+            tensor_parallel_rank=training_args.tensor_parallel_rank,
+            dtype=training_args.model_dtype,
+            recompute=training_args.critic_recompute,
+            recompute_granularity=model_args.critic_recompute_granularity,
+            recompute_use_reentrant=training_args.recompute_use_reentrant,
+            **common_config,
+        )
+        LlmMetaConfig.set_llm_config(critic_model_config, training_args)
+
+        critic_model_config.max_position_embeddings = data_args.max_length
+        critic_model_config.use_sparse_head_and_loss_fn = False
+        critic_model_config.num_labels = 1
+        critic_model_config.classifier_dropout = 0.0
+        critic_model_config.hidden_dropout = 0.0
+        logger.info(f"Loading Critic model with config:\n\t{critic_model_config}\n")
+
+        if not training_args.autotuner_benchmark:
+            critic_model = AutoModelForTokenClassification.from_pretrained(
+                model_args.critic_model_name_or_path,
+                config=critic_model_config,
             )
-            if not training_args.autotuner_benchmark:
-                critic_model.set_state_dict(reward_model.state_dict())
         else:
-            if not training_args.autotuner_benchmark:
-                critic_model = AutoModelForScore.from_pretrained(
-                    model_args.critic_model_name_or_path,
-                    config=reward_model_config,
-                    score_type="critic",
-                    do_normalize=False,
-                    clip_range_value=training_args.clip_range_value,
-                    **common_config,
-                )
-            else:
-                critic_model = AutoModelForScore.from_config(
-                    reward_model_config,
-                    score_type="critic",
-                    do_normalize=False,
-                    clip_range_value=training_args.clip_range_value,
-                    **common_config,
-                )
+            critic_model = AutoModelForTokenClassification.from_config(
+                critic_model_config,
+            )

     critic_tokenizer = AutoTokenizer.from_pretrained(
         model_args.critic_model_name_or_path,
         model_max_length=data_args.max_length,
         padding_side="left",
-        tokenizer_alpha=model_args.reward_critic_tokenizer_alpha,
+        tokenizer_alpha=model_args.critic_tokenizer_alpha,
         use_fast=True,
     )
     if critic_tokenizer.pad_token_id is None:
@@ -261,16 +258,16 @@ def create_critic_models(
         if training_args.eval_mode == "single":
             config.tensor_parallel_degree = -1
             config.tensor_parallel_rank = 0
-        with timers_scope_runtimer("Reward critic eval model loading time"):
-            critic_eval_model = AutoModelForScore.from_config(config)
+        with timers_scope_runtimer("Critic eval model loading time"):
+            critic_eval_model = AutoModelForTokenClassification.from_config(config)
     else:
         critic_eval_model = None

     return critic_model, critic_eval_model, critic_tokenizer


 def create_rl_dataset(data_args, training_args, tokenizer):
-    requires_label = True if training_args.use_rm_server else False
+    requires_label = True if training_args.use_rm_server or training_args.use_rule_reward else False
     train_ds = RLHFDataset(
         dataset_name_or_path=data_args.train_datasets,
         tokenizer=tokenizer,
@@ -333,15 +330,16 @@ def main():
     actor_model, actor_eval_model, reference_model, actor_tokenizer = create_actor_models(
         model_args, data_args, training_args, common_config, reshard_controller
     )
-
-    if not training_args.use_rm_server and model_args.reward_model_name_or_path is not None:
+    if training_args.use_rule_reward:
+        reward_model, reward_tokenizer = None, actor_tokenizer
+    elif not training_args.use_rm_server and model_args.reward_model_name_or_path is not None:
         reward_model, reward_tokenizer = create_reward_models(model_args, data_args, training_args, common_config)
     else:
         reward_model, reward_tokenizer = model_args.reward_server, actor_tokenizer

     if training_args.rl_algorithm == "ppo":
         critic_model, critic_eval_model, critic_tokenizer = create_critic_models(
-            model_args, data_args, training_args, common_config, reward_model
+            model_args, data_args, training_args, common_config
         )
     else:
         critic_model, critic_eval_model, critic_tokenizer = None, None, None
@@ -355,15 +353,23 @@ def main():
     offload_tensor_to_cpu((reference_model, "freeze_model"))

     if training_args.rl_algorithm == "ppo":
-        offload_tensor_to_cpu((reward_model, "freeze_model"))
+        if not training_args.use_rm_server and not training_args.use_rule_reward:
+            offload_tensor_to_cpu((reward_model, "freeze_model"))
         if critic_eval_model is not None:
             offload_tensor_to_cpu((critic_eval_model, "freeze_model"))

     # NOTE(gongenlei): release memory_reserved_size to equal to memory_allocated_size
     paddle.device.cuda.empty_cache()

     def compute_metrics(eval_preds):
-        accuracy = (eval_preds.predictions == 3).astype("float32").mean().item()
+        """
+        If use_rm_server is True, scores range from -3 to 3 and 3 is the only fully correct score (format + result).
+        If the rule-based regex-matching reward is used (use_rule_reward=True, currently only for GSM8K), scores range from 0 to 1.
+        """
+        if training_args.use_rule_reward:
+            accuracy = (eval_preds.predictions == 1).astype("float32").mean().item()
+        else:
+            accuracy = (eval_preds.predictions == 3).astype("float32").mean().item()
         return {"accuracy": accuracy}

     try:
@@ -389,7 +395,7 @@ def compute_metrics(eval_preds):
         data_collator=partial(
             collate_fn,
             pad_token_id=actor_tokenizer.pad_token_id,
-            requires_label=True if training_args.use_rm_server else False,
+            requires_label=True if training_args.use_rm_server or training_args.use_rule_reward else False,
             max_prompt_len=data_args.max_prompt_len if training_args.balance_batch else None,
         ),  # NOTE: enforce prompt padding to max_prompt_len when using balance_batch
         compute_metrics=compute_metrics,  # TODO: only used for grpo (kk datasets)
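To make the new metric concrete, here is a short sketch (my own example, using numpy arrays in place of eval_preds.predictions) of the two accuracy branches: with the rule-based reward a score of 1 marks a correct answer, while the reward-model-server path still treats 3 as the only fully correct score.

import numpy as np

# Stand-ins for eval_preds.predictions; the values are illustrative only.
rule_scores = np.array([1, 0, 1, 1])   # use_rule_reward=True: rewards in {0, 1}
rm_scores = np.array([3, 2, -1, 3])    # use_rm_server=True: rewards in [-3, 3]

rule_accuracy = (rule_scores == 1).astype("float32").mean().item()  # 0.75
rm_accuracy = (rm_scores == 3).astype("float32").mean().item()      # 0.5
print({"rule_accuracy": rule_accuracy, "rm_accuracy": rm_accuracy})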
