diff --git a/EDA.ipynb b/EDA.ipynb
index 76baf66..1c65ba1 100644
--- a/EDA.ipynb
+++ b/EDA.ipynb
@@ -1,28 +1,4 @@
 {
- "metadata": {
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
-  },
-  "orig_nbformat": 4,
-  "kernelspec": {
-   "name": "python3",
-   "display_name": "Python 3.8.10 64-bit ('.venv': venv)"
-  },
-  "interpreter": {
-   "hash": "25d32b45eddbc1e23c07b06a2c9ff49f418e028170a37fc806346e2c2002bf83"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2,
  "cells": [
   {
    "cell_type": "code",
@@ -30,8 +6,8 @@
    "metadata": {},
    "outputs": [
     {
-     "output_type": "stream",
      "name": "stderr",
+     "output_type": "stream",
      "text": [
       "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
      ]
@@ -54,8 +30,8 @@
    "metadata": {},
    "outputs": [
     {
-     "output_type": "stream",
      "name": "stderr",
+     "output_type": "stream",
      "text": [
       "Using the latest cached version of the module from /home/shpotes/.cache/huggingface/modules/datasets_modules/datasets/code_clippy/86b09b4a623c1c39753a8ad165e05757d9a97daf132ac71d3b6eb791e7da16dd (last modified on Fri Jul 9 22:06:59 2021) since it couldn't be found locally at $HOME/gpt-code-clippy/code_clippy.py/code_clippy.py or remotely (FileNotFoundError).\n",
       "Using the latest cached version of the module from /home/shpotes/.cache/huggingface/modules/datasets_modules/datasets/code_clippy/86b09b4a623c1c39753a8ad165e05757d9a97daf132ac71d3b6eb791e7da16dd (last modified on Fri Jul 9 22:06:59 2021) since it couldn't be found locally at $HOME/gpt-code-clippy/code_clippy.py/code_clippy.py or remotely (FileNotFoundError).\n",
@@ -183,8 +159,8 @@
    "metadata": {},
    "outputs": [
     {
-     "output_type": "stream",
      "name": "stderr",
+     "output_type": "stream",
      "text": [
       "0it [00:00, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2708 > 2048). Running this sequence through the model will result in indexing errors\n",
       "61it [00:55, 1.36it/s]"
@@ -204,5 +180,29 @@
     "collapse_metrics(eval_dataset, 'eval_metrics.pkl')"
    ]
   }
- ]
-}
\ No newline at end of file
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "25d32b45eddbc1e23c07b06a2c9ff49f418e028170a37fc806346e2c2002bf83"
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/finetune_apps.sh b/finetune_apps.sh
index bd0d454..49a122b 100644
--- a/finetune_apps.sh
+++ b/finetune_apps.sh
@@ -1,29 +1,29 @@
 #! /bin/bash
 ./run_clm_apps.py \
-    --output_dir $HOME/gpt-code-clippy-apps-2 \
-    --model_name_or_path EleutherAI/gpt-neo-1.3B \
+    --output_dir $HOME/gpt-code-clippy-apps-125m-2048-raw \
+    --model_name_or_path $HOME/gpt-code-clippy-125M-bs2048-raw \
     --dataset_name ./apps.py \
     --do_train --do_eval \
     --block_size="1024" \
-    --per_device_train_batch_size="2" \
-    --per_device_eval_batch_size="2" \
+    --per_device_train_batch_size="16" \
+    --per_device_eval_batch_size="16" \
     --preprocessing_num_workers="16" \
-    --learning_rate="2e-5" \
-    --warmup_steps="5000" \
+    --learning_rate="5e-5" \
+    --warmup_steps="800" \
     --adam_beta1="0.9" \
     --adam_beta2="0.98" \
     --weight_decay="0.1" \
     --overwrite_output_dir \
     --num_train_epochs="5" \
     --logging_steps="20" \
-    --eval_steps="1000" \
+    --eval_steps="100" \
     --push_to_hub="False" \
     --report_to="wandb" \
     --dtype="bfloat16" \
     --skip_memory_metrics="False" \
-    --save_steps="1000" \
+    --save_steps="100" \
     --save_strategy epoch \
-    --save_total_limit 2 \
+    --save_total_limit 5 \
     --gradient_accumulation_steps 2 \
     --adafactor \
     # --resume_from_checkpoint $HOME/gpt-neo-125M-code-clippy/ckpt_201 \
diff --git a/run_clm_apps.py b/run_clm_apps.py
old mode 100644
new mode 100755
index 78d5179..42f1ec9
--- a/run_clm_apps.py
+++ b/run_clm_apps.py
@@ -165,9 +165,13 @@ class DataTrainingArguments:
         metadata={"help": "The number of processes to use for the preprocessing."},
     )
     text_column_name: Optional[str] = field(
-        default='text',
-        metadata={"help": "Column containing main text data."},
+        default='text',
+        metadata={"help": "Column containing main text data."},
     )
+    all_data: Optional[bool] = field(
+        default=False,
+        metadata={"help": "If True will use all data ignoring original APPS splits."}
+    )
 
     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -381,18 +385,23 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        whole_dataset = load_dataset(
-            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, keep_in_memory=False
-        )
+        if data_args.all_data:
+            whole_dataset = load_dataset(
+                data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, keep_in_memory=False
+            )
 
-        whole_dataset = concatenate_datasets([whole_dataset["train"], whole_dataset["test"]])
-        split_id = int(0.9*len(whole_dataset))
-        train_idx = list(range(split_id))
-        valid_idx = list(range(split_id, len(whole_dataset)))
-        dataset = DatasetDict({
-            "train":whole_dataset.select(train_idx),
-            "validation":whole_dataset.select(valid_idx)
-        })
+            whole_dataset = concatenate_datasets([whole_dataset["train"], whole_dataset["test"]])
+            split_id = int(0.9*len(whole_dataset))
+            train_idx = list(range(split_id))
+            valid_idx = list(range(split_id, len(whole_dataset)))
+            dataset = DatasetDict({
+                "train":whole_dataset.select(train_idx),
+                "validation":whole_dataset.select(valid_idx)
+            })
+        else:
+            dataset = load_dataset(
+                data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, keep_in_memory=False
+            )
 
     if "validation" not in dataset.keys():
         dataset["validation"] = load_dataset(
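A note on the `all_data` branch added in the last hunk: it merges the original APPS `train` and `test` splits and then re-splits the result 90/10 by position, with no shuffling, so the new validation set is simply the tail of the concatenated data. The following is a minimal sketch of that behavior in isolation; the `datasets` calls mirror the ones in the hunk, while the toy in-memory splits stand in for APPS and are purely illustrative:

```python
from datasets import Dataset, DatasetDict, concatenate_datasets

# Toy stand-ins for the APPS "train" and "test" splits (illustrative only).
train = Dataset.from_dict({"text": [f"train-{i}" for i in range(90)]})
test = Dataset.from_dict({"text": [f"test-{i}" for i in range(10)]})

# Same logic as the new all_data branch: concatenate, then take the first
# 90% as train and the remaining 10% as validation. The split is
# deterministic and positional, not random.
whole_dataset = concatenate_datasets([train, test])
split_id = int(0.9 * len(whole_dataset))
dataset = DatasetDict({
    "train": whole_dataset.select(range(split_id)),
    "validation": whole_dataset.select(range(split_id, len(whole_dataset))),
})

print(len(dataset["train"]), len(dataset["validation"]))  # -> 90 10
```

Because `concatenate_datasets` places `test` at the end and the cut is positional, the validation slice is drawn from the tail of the original test split whenever that split makes up at least 10% of the combined data; shuffling before the cut would be required to get a mixed validation set.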