Adds fine-tuning notebooks (#18)

arampacha 2021-07-05 21:24:02 +03:00 committed by GitHub
parent ff58c4ae95
commit a53dfe2f03
7 changed files with 6962 additions and 959 deletions

flax-gpt-neo-clm-v2.ipynb (new file, 6554 lines)

File diff suppressed because it is too large.

File diff suppressed because it is too large.

View File

@@ -170,6 +170,10 @@ class DataTrainingArguments:
         default=None,
         metadata={"help": "The number of processes to use for the preprocessing."},
     )
+    text_column_name: Optional[str] = field(
+        default='text',
+        metadata={"help": "Column containing main text data."},
+    )
 
     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -366,7 +370,7 @@ def main():
         column_names = dataset["train"].column_names
     else:
         column_names = dataset["validation"].column_names
-    text_column_name = "text" if "text" in column_names else column_names[0]
+    text_column_name = data_args.text_column_name
     # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
     tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
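
For context, a minimal sketch of how such a configurable column name is typically consumed in the tokenization step of a Flax CLM script; the helper name below is illustrative and not part of this diff:

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class DataTrainingArguments:
    # Same field as added in the hunk above.
    text_column_name: Optional[str] = field(
        default="text",
        metadata={"help": "Column containing main text data."},
    )

def make_tokenize_function(tokenizer, data_args):
    # Illustrative helper: the tokenizer reads from whichever column the user
    # selected, e.g. "func_code_string" when fine-tuning on code_search_net.
    def tokenize_function(examples):
        return tokenizer(examples[data_args.text_column_name])
    return tokenize_function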

View File

@@ -170,6 +170,10 @@ class DataTrainingArguments:
         default=None,
         metadata={"help": "The number of processes to use for the preprocessing."},
     )
+    text_column_name: Optional[str] = field(
+        default='text',
+        metadata={"help": "Column containing main text data."},
+    )
 
     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -366,7 +370,7 @@ def main():
         column_names = dataset["train"].column_names
     else:
         column_names = dataset["validation"].column_names
-    text_column_name = "text" if "text" in column_names else column_names[0]
+    text_column_name = data_args.text_column_name
     # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
     tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

View File

@@ -4,10 +4,12 @@
     --model_name_or_path="EleutherAI/gpt-neo-125M" \
     --dataset_name="code_search_net" \
     --dataset_config_name="python" \
+    --text_column_name="func_code_string" \
     --do_train --do_eval \
     --block_size="128" \
     --per_device_train_batch_size="64" \
     --per_device_eval_batch_size="128" \
+    --preprocessing_num_workers="8" \
     --learning_rate="5e-3" \
     --warmup_steps="1000" \
     --adam_beta1="0.9" \
@@ -15,4 +17,4 @@
     --weight_decay="0.01" \
     --overwrite_output_dir \
     --num_train_epochs="1" \
-    --push_to_hub False
+    --push_to_hub="False"
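
The new --text_column_name flag matters here because code_search_net has no "text" column; the function source is stored under "func_code_string". A quick check with the datasets library illustrates this (a sketch assuming the code_search_net dataset on the Hugging Face Hub):

from datasets import load_dataset

# Assumption: the Python config of code_search_net exposes a
# "func_code_string" column holding the raw function source.
ds = load_dataset("code_search_net", "python", split="train[:10]")
print(ds.column_names)                   # should include "func_code_string"
print(ds[0]["func_code_string"][:120])   # preview of one function body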

View File

@@ -4,10 +4,12 @@
     --model_name_or_path="EleutherAI/gpt-neo-1.3B" \
     --dataset_name="code_search_net" \
     --dataset_config_name="python" \
+    --text_column_name="func_code_string" \
     --do_train --do_eval \
     --block_size="128" \
-    --per_device_train_batch_size="4" \
-    --per_device_eval_batch_size="8" \
+    --per_device_train_batch_size="1" \
+    --per_device_eval_batch_size="2" \
+    --preprocessing_num_workers="8" \
     --dtype="bfloat16" \
     --learning_rate="5e-3" \
     --warmup_steps="1000" \
@@ -16,4 +18,4 @@
     --weight_decay="0.01" \
     --overwrite_output_dir \
     --num_train_epochs="1" \
-    --push_to_hub False
+    --push_to_hub="False"
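
The 1.3B configuration drops the per-device batch sizes to 1 and 2 and keeps the model in bfloat16. A rough sketch of what a --dtype="bfloat16" flag usually maps to in the Flax examples (assuming transformers' Flax GPT-Neo API; not copied from this repository):

import jax.numpy as jnp
from transformers import FlaxGPTNeoForCausalLM

# Assumption: the script forwards --dtype to from_pretrained as a jnp dtype,
# so the forward pass runs in bfloat16.
model = FlaxGPTNeoForCausalLM.from_pretrained(
    "EleutherAI/gpt-neo-1.3B", dtype=jnp.bfloat16
)
# Casting the parameters themselves to bfloat16 roughly halves device memory.
model.params = model.to_bf16(model.params)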

View File

@@ -4,10 +4,12 @@
     --model_name_or_path="EleutherAI/gpt-neo-2.7B" \
     --dataset_name="code_search_net" \
     --dataset_config_name="python" \
+    --text_column_name="func_code_string" \
     --do_train --do_eval \
     --block_size="128" \
     --per_device_train_batch_size="1" \
     --per_device_eval_batch_size="1" \
+    --preprocessing_num_workers="8" \
     --dtype="bfloat16" \
     --learning_rate="5e-3" \
     --warmup_steps="1000" \
@@ -16,4 +18,4 @@
     --weight_decay="0.01" \
     --overwrite_output_dir \
     --num_train_epochs="1" \
-    --push_to_hub False
+    --push_to_hub="False"
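
With data parallelism in JAX, the per-device batch sizes above translate into a small global batch. A quick sketch of the arithmetic (the 8-device count assumes a TPU v3-8, which is not stated in this diff):

import jax

per_device_train_batch_size = 1  # value from the 2.7B script above
# Global batch = per-device batch * number of data-parallel devices.
global_batch_size = per_device_train_batch_size * jax.device_count()
print(jax.device_count(), global_batch_size)  # e.g. 8 and 8 on a TPU v3-8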