Update APPS fine-tuning script

arampacha 2021-07-16 19:23:52 +00:00
parent 6020b3cb86
commit ec042fd0fb
3 changed files with 60 additions and 51 deletions

@@ -1,28 +1,4 @@
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 4,
"kernelspec": {
"name": "python3",
"display_name": "Python 3.8.10 64-bit ('.venv': venv)"
},
"interpreter": {
"hash": "25d32b45eddbc1e23c07b06a2c9ff49f418e028170a37fc806346e2c2002bf83"
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
@@ -30,8 +6,8 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"output_type": "stream",
"text": [
"None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
]
@@ -54,8 +30,8 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"output_type": "stream",
"text": [
"Using the latest cached version of the module from /home/shpotes/.cache/huggingface/modules/datasets_modules/datasets/code_clippy/86b09b4a623c1c39753a8ad165e05757d9a97daf132ac71d3b6eb791e7da16dd (last modified on Fri Jul 9 22:06:59 2021) since it couldn't be found locally at $HOME/gpt-code-clippy/code_clippy.py/code_clippy.py or remotely (FileNotFoundError).\n",
"Using the latest cached version of the module from /home/shpotes/.cache/huggingface/modules/datasets_modules/datasets/code_clippy/86b09b4a623c1c39753a8ad165e05757d9a97daf132ac71d3b6eb791e7da16dd (last modified on Fri Jul 9 22:06:59 2021) since it couldn't be found locally at $HOME/gpt-code-clippy/code_clippy.py/code_clippy.py or remotely (FileNotFoundError).\n",
@@ -183,8 +159,8 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"output_type": "stream",
"text": [
"0it [00:00, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2708 > 2048). Running this sequence through the model will result in indexing errors\n",
"61it [00:55, 1.36it/s]"
@@ -204,5 +180,29 @@
"collapse_metrics(eval_dataset, 'eval_metrics.pkl')"
]
}
]
}
],
"metadata": {
"interpreter": {
"hash": "25d32b45eddbc1e23c07b06a2c9ff49f418e028170a37fc806346e2c2002bf83"
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
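
The stderr output above includes the tokenizer warning that one example is longer than the model context (2708 > 2048 tokens). That warning is usually benign when the pipeline later concatenates the token ids and re-cuts them into fixed-size blocks, as the upstream Hugging Face causal-LM examples do; whether this notebook relies on that is not shown here. A minimal sketch of such a chunking step, with field names and block size chosen for illustration rather than taken from the notebook:

# Minimal sketch of block-wise chunking for causal LM data. Assumes `examples`
# is a dict of lists as returned by a Hugging Face tokenizer in batched mode
# and that it contains an "input_ids" field; block_size is illustrative.
def group_texts(examples, block_size=2048):
    # Flatten every field (input_ids, attention_mask, ...) into one long list.
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated["input_ids"])
    # Drop the trailing remainder so every block is exactly block_size tokens.
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }
    # For causal LM training the labels are simply a copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result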

@@ -1,29 +1,29 @@
#! /bin/bash
./run_clm_apps.py \
--output_dir $HOME/gpt-code-clippy-apps-2 \
--model_name_or_path EleutherAI/gpt-neo-1.3B \
--output_dir $HOME/gpt-code-clippy-apps-125m-2048-raw \
--model_name_or_path $HOME/gpt-code-clippy-125M-bs2048-raw \
--dataset_name ./apps.py \
--do_train --do_eval \
--block_size="1024" \
--per_device_train_batch_size="2" \
--per_device_eval_batch_size="2" \
--per_device_train_batch_size="16" \
--per_device_eval_batch_size="16" \
--preprocessing_num_workers="16" \
--learning_rate="2e-5" \
--warmup_steps="5000" \
--learning_rate="5e-5" \
--warmup_steps="800" \
--adam_beta1="0.9" \
--adam_beta2="0.98" \
--weight_decay="0.1" \
--overwrite_output_dir \
--num_train_epochs="5" \
--logging_steps="20" \
--eval_steps="1000" \
--eval_steps="100" \
--push_to_hub="False" \
--report_to="wandb" \
--dtype="bfloat16" \
--skip_memory_metrics="False" \
--save_steps="1000" \
--save_steps="100" \
--save_strategy epoch \
--save_total_limit 2 \
--save_total_limit 5 \
--gradient_accumulation_steps 2 \
--adafactor \
# --resume_from_checkpoint $HOME/gpt-neo-125M-code-clippy/ckpt_201 \
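
The revised flags above imply a different effective optimizer batch size: 16 sequences per device, accumulated over 2 steps, times the number of devices. A small sketch of that arithmetic; the device count is an assumption for illustration, since the script itself does not set it:

# Effective global batch size implied by the updated flags.
per_device_train_batch_size = 16   # from the script above
gradient_accumulation_steps = 2    # from the script above
num_devices = 8                    # assumed (e.g. one TPU v3-8); not fixed by the script

effective_batch = per_device_train_batch_size * gradient_accumulation_steps * num_devices
print(effective_batch)  # 256 sequences of block_size=1024 tokens per optimizer update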

run_clm_apps.py Normal file → Executable file

@@ -165,9 +165,13 @@ class DataTrainingArguments:
metadata={"help": "The number of processes to use for the preprocessing."},
)
text_column_name: Optional[str] = field(
default='text',
metadata={"help": "Column containing main text data."},
default='text',
metadata={"help": "Column containing main text data."},
)
all_data: Optional[bool] = field(
default=False,
metadata={"help":"If True will use all data ignoring original APPS splits."}
)
def __post_init__(self):
if self.dataset_name is None and self.train_file is None and self.validation_file is None:
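
If run_clm_apps.py parses its dataclasses with transformers.HfArgumentParser, as the upstream run_clm scripts do (an assumption, not shown in this hunk), the new all_data field surfaces as an --all_data command-line flag. A self-contained sketch with a stand-in dataclass:

# Stand-in dataclass mirroring the `all_data` field added above; parsing it with
# HfArgumentParser (assumed to be what run_clm_apps.py uses) exposes --all_data.
from dataclasses import dataclass, field
from typing import Optional
from transformers import HfArgumentParser

@dataclass
class DemoDataArgs:
    all_data: Optional[bool] = field(
        default=False,
        metadata={"help": "If True will use all data ignoring original APPS splits."},
    )

parser = HfArgumentParser(DemoDataArgs)
(demo_args,) = parser.parse_args_into_dataclasses(["--all_data", "True"])
print(demo_args.all_data)  # True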
@@ -381,18 +385,23 @@ def main():
# download the dataset.
if data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub.
whole_dataset = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, keep_in_memory=False
)
if data_args.all_data:
whole_dataset = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, keep_in_memory=False
)
whole_dataset = concatenate_datasets([whole_dataset["train"], whole_dataset["test"]])
split_id = int(0.9*len(whole_dataset))
train_idx = list(range(split_id))
valid_idx = list(range(split_id, len(whole_dataset)))
dataset = DatasetDict({
"train":whole_dataset.select(train_idx),
"validation":whole_dataset.select(valid_idx)
})
whole_dataset = concatenate_datasets([whole_dataset["train"], whole_dataset["test"]])
split_id = int(0.9*len(whole_dataset))
train_idx = list(range(split_id))
valid_idx = list(range(split_id, len(whole_dataset)))
dataset = DatasetDict({
"train":whole_dataset.select(train_idx),
"validation":whole_dataset.select(valid_idx)
})
else:
dataset = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, keep_in_memory=False
)
if "validation" not in dataset.keys():
dataset["validation"] = load_dataset(