Adds fine-tuning notebooks (#18)

arampacha 2021-07-05 21:24:02 +03:00 committed by GitHub
parent ff58c4ae95
commit a53dfe2f03
7 changed files with 6962 additions and 959 deletions

flax-gpt-neo-clm-v2.ipynb (new file, 6554 lines)

File diff suppressed because it is too large.

File diff suppressed because it is too large.

View File

@@ -170,6 +170,10 @@ class DataTrainingArguments:
         default=None,
         metadata={"help": "The number of processes to use for the preprocessing."},
     )
+    text_column_name: Optional[str] = field(
+        default='text',
+        metadata={"help": "Column containing main text data."},
+    )
 
     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -366,7 +370,7 @@ def main():
         column_names = dataset["train"].column_names
     else:
         column_names = dataset["validation"].column_names
-    text_column_name = "text" if "text" in column_names else column_names[0]
+    text_column_name = data_args.text_column_name
     # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
     tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
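
For context, a minimal sketch of how such a configurable column name is typically consumed in the tokenization step of a Flax CLM script; the helper name below is illustrative and not part of this diff:

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class DataTrainingArguments:
    # Same field as added in the hunk above.
    text_column_name: Optional[str] = field(
        default="text",
        metadata={"help": "Column containing main text data."},
    )

def make_tokenize_function(tokenizer, data_args):
    # Illustrative helper: the tokenizer reads from whichever column the user
    # selected, e.g. "func_code_string" when fine-tuning on code_search_net.
    def tokenize_function(examples):
        return tokenizer(examples[data_args.text_column_name])
    return tokenize_function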

View File

@@ -170,6 +170,10 @@ class DataTrainingArguments:
         default=None,
         metadata={"help": "The number of processes to use for the preprocessing."},
     )
+    text_column_name: Optional[str] = field(
+        default='text',
+        metadata={"help": "Column containing main text data."},
+    )
 
     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -366,7 +370,7 @@ def main():
         column_names = dataset["train"].column_names
     else:
         column_names = dataset["validation"].column_names
-    text_column_name = "text" if "text" in column_names else column_names[0]
+    text_column_name = data_args.text_column_name
     # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
     tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

View File

@@ -4,10 +4,12 @@
     --model_name_or_path="EleutherAI/gpt-neo-125M" \
     --dataset_name="code_search_net" \
     --dataset_config_name="python" \
+    --text_column_name="func_code_string" \
     --do_train --do_eval \
     --block_size="128" \
     --per_device_train_batch_size="64" \
     --per_device_eval_batch_size="128" \
+    --preprocessing_num_workers="8" \
     --learning_rate="5e-3" \
     --warmup_steps="1000" \
     --adam_beta1="0.9" \
@@ -15,4 +17,4 @@
     --weight_decay="0.01" \
     --overwrite_output_dir \
     --num_train_epochs="1" \
-    --push_to_hub False
+    --push_to_hub="False"
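
The new --text_column_name flag matters here because code_search_net has no "text" column; the function source is stored under "func_code_string". A quick check with the datasets library illustrates this (a sketch assuming the code_search_net dataset on the Hugging Face Hub):

from datasets import load_dataset

# Assumption: the Python config of code_search_net exposes a
# "func_code_string" column holding the raw function source.
ds = load_dataset("code_search_net", "python", split="train[:10]")
print(ds.column_names)                   # should include "func_code_string"
print(ds[0]["func_code_string"][:120])   # preview of one function body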

View File

@@ -4,10 +4,12 @@
     --model_name_or_path="EleutherAI/gpt-neo-1.3B" \
     --dataset_name="code_search_net" \
     --dataset_config_name="python" \
+    --text_column_name="func_code_string" \
     --do_train --do_eval \
     --block_size="128" \
-    --per_device_train_batch_size="4" \
-    --per_device_eval_batch_size="8" \
+    --per_device_train_batch_size="1" \
+    --per_device_eval_batch_size="2" \
+    --preprocessing_num_workers="8" \
     --dtype="bfloat16" \
     --learning_rate="5e-3" \
     --warmup_steps="1000" \
@@ -16,4 +18,4 @@
     --weight_decay="0.01" \
     --overwrite_output_dir \
     --num_train_epochs="1" \
-    --push_to_hub False
+    --push_to_hub="False"
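
The 1.3B configuration drops the per-device batch sizes to 1 and 2 and keeps the model in bfloat16. A rough sketch of what a --dtype="bfloat16" flag usually maps to in the Flax examples (assuming transformers' Flax GPT-Neo API; not copied from this repository):

import jax.numpy as jnp
from transformers import FlaxGPTNeoForCausalLM

# Assumption: the script forwards --dtype to from_pretrained as a jnp dtype,
# so the forward pass runs in bfloat16.
model = FlaxGPTNeoForCausalLM.from_pretrained(
    "EleutherAI/gpt-neo-1.3B", dtype=jnp.bfloat16
)
# Casting the parameters themselves to bfloat16 roughly halves device memory.
model.params = model.to_bf16(model.params)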

View File

@@ -4,10 +4,12 @@
     --model_name_or_path="EleutherAI/gpt-neo-2.7B" \
     --dataset_name="code_search_net" \
     --dataset_config_name="python" \
+    --text_column_name="func_code_string" \
     --do_train --do_eval \
     --block_size="128" \
     --per_device_train_batch_size="1" \
     --per_device_eval_batch_size="1" \
+    --preprocessing_num_workers="8" \
     --dtype="bfloat16" \
     --learning_rate="5e-3" \
     --warmup_steps="1000" \
@@ -16,4 +18,4 @@
     --weight_decay="0.01" \
     --overwrite_output_dir \
     --num_train_epochs="1" \
-    --push_to_hub False
+    --push_to_hub="False"
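
With data parallelism in JAX, the per-device batch sizes above translate into a small global batch. A quick sketch of the arithmetic (the 8-device count assumes a TPU v3-8, which is not stated in this diff):

import jax

per_device_train_batch_size = 1  # value from the 2.7B script above
# Global batch = per-device batch * number of data-parallel devices.
global_batch_size = per_device_train_batch_size * jax.device_count()
print(jax.device_count(), global_batch_size)  # e.g. 8 and 8 on a TPU v3-8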