Commit ec042fd0fb ("upd apps fine-tuning script"), parent 6020b3cb86
Mirror of https://github.com/CodedotAl/gpt-code-clippy.git, synced 2024-08-16 10:20:28 +03:00
EDA.ipynb — 58 changed lines
@@ -1,28 +1,4 @@
 {
- "metadata": {
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
-  },
-  "orig_nbformat": 4,
-  "kernelspec": {
-   "name": "python3",
-   "display_name": "Python 3.8.10 64-bit ('.venv': venv)"
-  },
-  "interpreter": {
-   "hash": "25d32b45eddbc1e23c07b06a2c9ff49f418e028170a37fc806346e2c2002bf83"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
@@ -30,8 +6,8 @@
    "metadata": {},
    "outputs": [
     {
-     "output_type": "stream",
      "name": "stderr",
+     "output_type": "stream",
      "text": [
       "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
      ]
@@ -54,8 +30,8 @@
    "metadata": {},
    "outputs": [
     {
-     "output_type": "stream",
      "name": "stderr",
+     "output_type": "stream",
      "text": [
       "Using the latest cached version of the module from /home/shpotes/.cache/huggingface/modules/datasets_modules/datasets/code_clippy/86b09b4a623c1c39753a8ad165e05757d9a97daf132ac71d3b6eb791e7da16dd (last modified on Fri Jul 9 22:06:59 2021) since it couldn't be found locally at $HOME/gpt-code-clippy/code_clippy.py/code_clippy.py or remotely (FileNotFoundError).\n",
       "Using the latest cached version of the module from /home/shpotes/.cache/huggingface/modules/datasets_modules/datasets/code_clippy/86b09b4a623c1c39753a8ad165e05757d9a97daf132ac71d3b6eb791e7da16dd (last modified on Fri Jul 9 22:06:59 2021) since it couldn't be found locally at $HOME/gpt-code-clippy/code_clippy.py/code_clippy.py or remotely (FileNotFoundError).\n",
@@ -183,8 +159,8 @@
    "metadata": {},
    "outputs": [
     {
-     "output_type": "stream",
      "name": "stderr",
+     "output_type": "stream",
      "text": [
       "0it [00:00, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2708 > 2048). Running this sequence through the model will result in indexing errors\n",
       "61it [00:55, 1.36it/s]"
@@ -204,5 +180,29 @@
     "collapse_metrics(eval_dataset, 'eval_metrics.pkl')"
    ]
   }
- ]
-}
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "25d32b45eddbc1e23c07b06a2c9ff49f418e028170a37fc806346e2c2002bf83"
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
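Note on the stderr output captured in the notebook above: one tokenized example ran to 2708 tokens, past GPT-Neo's 2048-token context, which would cause indexing errors if fed to the model. A minimal sketch of detecting and filtering such examples before evaluation — the tokenizer checkpoint and the "text" column name are assumptions for illustration, not taken from this commit:

# Sketch of guarding against over-long examples like the 2708-token one
# flagged above. Checkpoint name and the "text" column are assumptions.
from transformers import AutoTokenizer

MAX_LEN = 2048  # GPT-Neo context window

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")

def within_context(example):
    # Tokenize without truncation so the true length is measured; this is
    # exactly the call pattern that triggers the warning in the notebook.
    ids = tokenizer(example["text"], truncation=False)["input_ids"]
    return len(ids) <= MAX_LEN

# usage: dataset = dataset.filter(within_context)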
@@ -1,29 +1,29 @@
 #! /bin/bash
 ./run_clm_apps.py \
-    --output_dir $HOME/gpt-code-clippy-apps-2 \
-    --model_name_or_path EleutherAI/gpt-neo-1.3B \
+    --output_dir $HOME/gpt-code-clippy-apps-125m-2048-raw \
+    --model_name_or_path $HOME/gpt-code-clippy-125M-bs2048-raw \
     --dataset_name ./apps.py \
     --do_train --do_eval \
     --block_size="1024" \
-    --per_device_train_batch_size="2" \
-    --per_device_eval_batch_size="2" \
+    --per_device_train_batch_size="16" \
+    --per_device_eval_batch_size="16" \
     --preprocessing_num_workers="16" \
-    --learning_rate="2e-5" \
-    --warmup_steps="5000" \
+    --learning_rate="5e-5" \
+    --warmup_steps="800" \
     --adam_beta1="0.9" \
     --adam_beta2="0.98" \
     --weight_decay="0.1" \
     --overwrite_output_dir \
     --num_train_epochs="5" \
     --logging_steps="20" \
-    --eval_steps="1000" \
+    --eval_steps="100" \
     --push_to_hub="False" \
     --report_to="wandb" \
     --dtype="bfloat16" \
     --skip_memory_metrics="False" \
-    --save_steps="1000" \
+    --save_steps="100" \
     --save_strategy epoch \
-    --save_total_limit 2 \
+    --save_total_limit 5 \
     --gradient_accumulation_steps 2 \
     --adafactor \
     # --resume_from_checkpoint $HOME/gpt-neo-125M-code-clippy/ckpt_201 \
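The script update above switches the base model from the 1.3B GPT-Neo to a 125M code-clippy checkpoint, raises the per-device batch size from 2 to 16 and the learning rate from 2e-5 to 5e-5, shortens warmup from 5000 to 800 steps, and evaluates/saves every 100 steps instead of 1000, keeping 5 checkpoints instead of 2. A rough sanity check of what the new settings imply per optimizer update — the device count and dataset size below are assumptions for illustration, not values from this commit:

# Rough sanity check of the new settings. Device count and dataset size
# are assumptions for illustration, not values from this commit.
num_devices = 8            # e.g. a TPU v3-8, typical for these Flax runs
per_device_bs = 16         # --per_device_train_batch_size
grad_accum = 2             # --gradient_accumulation_steps
effective_bs = num_devices * per_device_bs * grad_accum
print(effective_bs)        # 256 sequences of --block_size=1024 tokens per update

train_examples = 100_000   # placeholder; actual APPS size depends on the split
steps_per_epoch = train_examples // effective_bs
print(steps_per_epoch)     # 390 -> --warmup_steps=800 would then cover roughly
                           # the first two of the five training epochs

One detail worth noting: the script passes both Adam betas and --adafactor. If run_clm_apps.py follows the usual Flax example pattern, the --adafactor switch selects the Adafactor optimizer and the Adam-specific flags are then unused.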
run_clm_apps.py — 35 changed lines, mode changed: Normal file → Executable file
@@ -165,9 +165,13 @@ class DataTrainingArguments:
         metadata={"help": "The number of processes to use for the preprocessing."},
     )
     text_column_name: Optional[str] = field(
         default='text',
         metadata={"help": "Column containing main text data."},
     )
+    all_data: Optional[bool] = field(
+        default=False,
+        metadata={"help":"If True will use all data ignoring original APPS splits."}
+    )

     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
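The new all_data field rides on transformers' HfArgumentParser, which turns dataclass fields into command-line flags. A minimal, self-contained sketch of how the flag is exposed and parsed — the field matches the diff, while the standalone parser setup is illustrative:

# Minimal sketch of how the new flag surfaces on the command line.
from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser


@dataclass
class DataTrainingArguments:
    all_data: Optional[bool] = field(
        default=False,
        metadata={"help": "If True will use all data ignoring original APPS splits."},
    )


parser = HfArgumentParser(DataTrainingArguments)
# HfArgumentParser derives --all_data from the field name and its help text.
(data_args,) = parser.parse_args_into_dataclasses(["--all_data", "True"])
print(data_args.all_data)  # True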
@@ -381,18 +385,23 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        whole_dataset = load_dataset(
-            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, keep_in_memory=False
-        )
+        if data_args.all_data:
+            whole_dataset = load_dataset(
+                data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, keep_in_memory=False
+            )

-        whole_dataset = concatenate_datasets([whole_dataset["train"], whole_dataset["test"]])
-        split_id = int(0.9*len(whole_dataset))
-        train_idx = list(range(split_id))
-        valid_idx = list(range(split_id, len(whole_dataset)))
-        dataset = DatasetDict({
-            "train":whole_dataset.select(train_idx),
-            "validation":whole_dataset.select(valid_idx)
-        })
+            whole_dataset = concatenate_datasets([whole_dataset["train"], whole_dataset["test"]])
+            split_id = int(0.9*len(whole_dataset))
+            train_idx = list(range(split_id))
+            valid_idx = list(range(split_id, len(whole_dataset)))
+            dataset = DatasetDict({
+                "train":whole_dataset.select(train_idx),
+                "validation":whole_dataset.select(valid_idx)
+            })
+        else:
+            dataset = load_dataset(
+                data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, keep_in_memory=False
+            )

     if "validation" not in dataset.keys():
         dataset["validation"] = load_dataset(
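This hunk makes the 90/10 re-split conditional: with --all_data, the published APPS train/test splits are concatenated and re-cut by index, otherwise the splits are loaded as published. For reference, the datasets library's built-in train_test_split can express the same index-based cut more compactly — a sketch, with the Hub dataset name standing in for the local ./apps.py loader used in this repo:

# Equivalent re-split using datasets' built-in helper. shuffle=False keeps
# the first 90% as train, mirroring the index-based cut in the diff.
# "codeparrot/apps" is a stand-in for the local ./apps.py loader used here.
from datasets import DatasetDict, concatenate_datasets, load_dataset

raw = load_dataset("codeparrot/apps")
whole = concatenate_datasets([raw["train"], raw["test"]])
split = whole.train_test_split(test_size=0.1, shuffle=False)
dataset = DatasetDict({"train": split["train"], "validation": split["test"]})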