Rename optimization.min_lr -> optimization.stop_min_lr (#1486)
Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/1486

Test Plan: Imported from OSS

Reviewed By: alexeib

Differential Revision: D25342181

Pulled By: myleott

fbshipit-source-id: 7d1cfb26334fff26d688648724ab073e5fb956f5
parent 4df4d0af8d
commit 72a25a4e52
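The rename only affects the stopping-criterion option; the learning-rate schedule itself is unchanged. As a minimal sketch of how an affected command changes (trimmed to the relevant flags, reusing the IWSLT data-bin path from one of the examples updated below):

```
# Before this commit: --min-lr doubled as the "stop training" threshold.
fairseq-train data-bin/iwslt14.tokenized.de-en \
    --optimizer adam --lr 0.0005 --lr-scheduler inverse_sqrt \
    --min-lr '1e-09'

# After this commit: the stopping threshold is --stop-min-lr
# (optimization.stop_min_lr in the dataclass config).
fairseq-train data-bin/iwslt14.tokenized.de-en \
    --optimizer adam --lr 0.0005 --lr-scheduler inverse_sqrt \
    --stop-min-lr '1e-09'
```

Old checkpoints that still store `args.min_lr` are rewritten to `stop_min_lr` when loaded; see the `_upgrade_state_dict` change below.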
@@ -182,9 +182,10 @@ sure to update ``--master_addr`` to the IP address of the first node:
  --arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \
  --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
  --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \
- --lr 0.0005 --min-lr 1e-09 \
+ --lr 0.0005 \
  --dropout 0.3 --weight-decay 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
  --max-tokens 3584 \
+ --max-epoch 70 \
  --fp16

  On SLURM clusters, fairseq will automatically detect the number of nodes and
@@ -61,7 +61,7 @@ fairseq-train \
  --max-update 2400000 --save-interval 1 --no-epoch-checkpoints \
  --arch xlm_base \
  --optimizer adam --lr-scheduler reduce_lr_on_plateau \
- --lr-shrink 0.5 --lr 0.0001 --min-lr 1e-09 \
+ --lr-shrink 0.5 --lr 0.0001 --stop-min-lr 1e-09 \
  --dropout 0.1 \
  --criterion legacy_masked_lm_loss \
  --max-tokens 2048 --tokens-per-sample 256 --attention-dropout 0.1 \
@@ -20,7 +20,7 @@ fairseq-train --task language_modeling \
  --save-dir checkpoints/transformer_wikitext-103 \
  --arch transformer_lm_wiki103 \
  --max-update 286000 --max-lr 1.0 --t-mult 2 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 \
- --warmup-updates 16000 --warmup-init-lr 1e-07 --min-lr 1e-09 --optimizer nag --lr 0.0001 --clip-norm 0.1 \
+ --warmup-updates 16000 --warmup-init-lr 1e-07 --stop-min-lr 1e-09 --optimizer nag --lr 0.0001 --clip-norm 0.1 \
  --criterion adaptive_loss --max-tokens 3072 --update-freq 3 --tokens-per-sample 3072 --seed 1 \
  --sample-break-mode none --skip-invalid-size-inputs-valid-test --ddp-backend=no_c10d
  ```
@@ -25,7 +25,7 @@ fairseq-train ${databin_dir} \
  --share-decoder-input-output-embed \
  --dropout 0.3 --attention-dropout 0.3 \
  --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
- --lr-scheduler inverse_sqrt --min-lr 1e-9 --warmup-init-lr 1e-7 --warmup-updates 8000 \
+ --lr-scheduler inverse_sqrt --stop-min-lr 1e-9 --warmup-init-lr 1e-7 --warmup-updates 8000 \
  --max-tokens 4096 --update-freq 1 \
  --lr 0.0015 \
  --clip-norm 1.0 \
@@ -73,7 +73,7 @@ fairseq-train path_2_data \
  --source-lang en_XX --target-lang ro_RO \
  --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
  --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
- --lr-scheduler polynomial_decay --lr 3e-05 --min-lr -1 --warmup-updates 2500 --total-num-update 40000 \
+ --lr-scheduler polynomial_decay --lr 3e-05 --warmup-updates 2500 --total-num-update 40000 \
  --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
  --max-tokens 1024 --update-freq 2 \
  --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
@@ -41,7 +41,7 @@ fairseq-train $path_2_data \
  --lang-pairs "$lang_pairs" \
  --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
  --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
- --lr-scheduler inverse_sqrt --lr 3e-05 --min-lr -1 --warmup-updates 2500 --max-update 40000 \
+ --lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \
  --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
  --max-tokens 1024 --update-freq 2 \
  --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
@@ -69,7 +69,7 @@ fairseq-train $path_2_data \
  --lang-pairs "$lang_pairs" \
  --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
  --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
- --lr-scheduler inverse_sqrt --lr 3e-05 --min-lr -1 --warmup-updates 2500 --max-update 40000 \
+ --lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \
  --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
  --max-tokens 1024 --update-freq 2 \
  --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
@@ -20,7 +20,7 @@ fairseq-train "$path_2_data" \
  --lang-pairs "$lang_pairs" \
  --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
  --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
- --lr-scheduler inverse_sqrt --lr 3e-05 --min-lr -1 --warmup-updates 2500 --max-update 40000 \
+ --lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \
  --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
  --max-tokens 1024 --update-freq 2 \
  --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
@@ -16,7 +16,7 @@ fairseq-train "$path_2_data" \
  --lang-pairs "$lang_pairs" \
  --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
  --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
- --lr-scheduler inverse_sqrt --lr 3e-05 --min-lr -1 --warmup-updates 2500 --max-update 40000 \
+ --lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \
  --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
  --max-tokens 1024 --update-freq 2 \
  --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
@@ -44,7 +44,7 @@ fairseq-train \
  --share-all-embeddings \
  --optimizer adam --adam-betas '(0.9,0.98)' \
  --lr 0.0005 --lr-scheduler inverse_sqrt \
- --min-lr '1e-09' --warmup-updates 10000 \
+ --stop-min-lr '1e-09' --warmup-updates 10000 \
  --warmup-init-lr '1e-07' --label-smoothing 0.1 \
  --dropout 0.3 --weight-decay 0.01 \
  --decoder-learned-pos \
@@ -14,7 +14,7 @@ fairseq-train \
  --share-all-embeddings \
  --optimizer adam --adam-betas '(0.9,0.98)' \
  --lr 0.0005 --lr-scheduler inverse_sqrt \
- --min-lr '1e-09' --warmup-updates 10000 \
+ --stop-min-lr '1e-09' --warmup-updates 10000 \
  --warmup-init-lr '1e-07' --label-smoothing 0.1 \
  --dropout 0.3 --weight-decay 0.01 \
  --decoder-learned-pos \
@@ -43,7 +43,7 @@ fairseq-train \
  --share-all-embeddings \
  --optimizer adam --adam-betas '(0.9,0.98)' \
  --lr 0.0005 --lr-scheduler inverse_sqrt \
- --min-lr '1e-09' --warmup-updates 10000 \
+ --stop-min-lr '1e-09' --warmup-updates 10000 \
  --warmup-init-lr '1e-07' --label-smoothing 0.1 \
  --dropout 0.3 --weight-decay 0.01 \
  --decoder-learned-pos \
@@ -76,7 +76,7 @@ fairseq-train \
  --share-all-embeddings \
  --optimizer adam --adam-betas '(0.9,0.98)' \
  --lr 0.0005 --lr-scheduler inverse_sqrt \
- --min-lr '1e-09' --warmup-updates 10000 \
+ --stop-min-lr '1e-09' --warmup-updates 10000 \
  --warmup-init-lr '1e-07' --label-smoothing 0.1 \
  --dropout 0.3 --weight-decay 0.01 \
  --decoder-learned-pos \
@@ -109,7 +109,7 @@ fairseq-train \
  --share-all-embeddings \
  --optimizer adam --adam-betas '(0.9,0.98)' \
  --lr 0.0005 --lr-scheduler inverse_sqrt \
- --min-lr '1e-09' --warmup-updates 10000 \
+ --stop-min-lr '1e-09' --warmup-updates 10000 \
  --warmup-init-lr '1e-07' --label-smoothing 0.1 \
  --dropout 0.3 --weight-decay 0.01 \
  --decoder-learned-pos \
@@ -136,7 +136,7 @@ fairseq-train \
  --share-all-embeddings \
  --optimizer adam --adam-betas '(0.9,0.98)' \
  --lr 0.0005 --lr-scheduler inverse_sqrt \
- --min-lr '1e-09' --warmup-updates 10000 \
+ --stop-min-lr '1e-09' --warmup-updates 10000 \
  --warmup-init-lr '1e-07' --label-smoothing 0.1 \
  --dropout 0.3 --weight-decay 0.01 \
  --decoder-learned-pos \
@@ -165,7 +165,7 @@ fairseq-train \
  --share-all-embeddings \
  --optimizer adam --adam-betas '(0.9,0.98)' \
  --lr 0.0005 --lr-scheduler inverse_sqrt \
- --min-lr '1e-09' --warmup-updates 10000 \
+ --stop-min-lr '1e-09' --warmup-updates 10000 \
  --warmup-init-lr '1e-07' --label-smoothing 0.1 \
  --dropout 0.3 --weight-decay 0.01 \
  --decoder-learned-pos \
@@ -110,7 +110,7 @@ mkdir -p $SAVE
  CUDA_VISIBLE_DEVICES=0 $(which fairseq-train) data-bin/iwslt14.tokenized.de-en \
  --clip-norm 0 --optimizer adam --lr 0.0005 \
  --source-lang de --target-lang en --max-tokens 4000 --no-progress-bar \
- --log-interval 100 --min-lr '1e-09' --weight-decay 0.0001 \
+ --log-interval 100 --stop-min-lr '1e-09' --weight-decay 0.0001 \
  --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
  --lr-scheduler inverse_sqrt \
  --ddp-backend=no_c10d \
@@ -137,10 +137,10 @@ python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \
  --max-update 30000 --share-all-embeddings --optimizer adam \
  --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \
  --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
- --min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \
+ --stop-min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \
  --ddp-backend=no_c10d --max-tokens 3584 \
  --lr-scheduler cosine --warmup-init-lr 1e-7 --warmup-updates 10000 \
- --lr-shrink 1 --max-lr 0.001 --lr 1e-7 --min-lr 1e-9 --warmup-init-lr 1e-07 \
+ --lr-shrink 1 --max-lr 0.001 --lr 1e-7 --warmup-init-lr 1e-07 \
  --t-mult 1 --lr-period-updates 20000 \
  --arch lightconv_wmt_en_de_big --save-dir $SAVE \
  --dropout 0.3 --attention-dropout 0.1 --weight-dropout 0.1 \
@@ -162,10 +162,10 @@ python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \
  --max-update 30000 --share-all-embeddings --optimizer adam \
  --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \
  --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
- --min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \
+ --stop-min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \
  --ddp-backend=no_c10d --max-tokens 3584 \
  --lr-scheduler cosine --warmup-init-lr 1e-7 --warmup-updates 10000 \
- --lr-shrink 1 --max-lr 0.001 --lr 1e-7 --min-lr 1e-9 --warmup-init-lr 1e-07 \
+ --lr-shrink 1 --max-lr 0.001 --lr 1e-7 --warmup-init-lr 1e-07 \
  --t-mult 1 --lr-period-updates 70000 \
  --arch lightconv_wmt_en_fr_big --save-dir $SAVE \
  --dropout 0.1 --attention-dropout 0.1 --weight-dropout 0.1 \
@@ -212,7 +212,7 @@ fairseq-train --task language_modeling /path/to/wikitext-103/data \
  --max-tokens 3072 --tokens-per-sample 3072 --momentum 0.99 --optimizer nag \
  --sample-break-mode none --update-freq 3 \
  --warmup-init-lr 1e-07 --warmup-updates 16000 \
- --weight-decay 0 --seed 1 --min-lr 1e-09 \
+ --weight-decay 0 --seed 1 --stop-min-lr 1e-09 \
  --quant-noise-pq 0.05 --quant-noise-pq-block-size 8
  ```

@@ -269,7 +269,7 @@ fairseq-train --task language_modeling /path/to/wikitext-103/data \
  --ddp-backend no_c10d \
  --decoder-attention-heads 8 --decoder-embed-dim 1024 --decoder-ffn-embed-dim 4096 --decoder-input-dim 1024 --decoder-layers 16 --decoder-normalize-before --decoder-output-dim 1024 \
  --fp16 --keep-last-epochs -1 \
- --lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --max-lr 0.05 --min-lr 1e-09 \
+ --lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --max-lr 0.05 --stop-min-lr 1e-09 \
  --max-tokens 2944 --tokens-per-sample 2944\
  --momentum 0.99 --no-epoch-checkpoints --no-progress-bar --optimizer nag --required-batch-size-multiple 8 \
  --sample-break-mode none --t-mult 2.0 --skip-invalid-size-inputs-valid-test \
@@ -23,7 +23,7 @@ fairseq-train \
  --optimizer adam --adam-betas '(0.9, 0.98)' \
  --lr-scheduler 'inverse_sqrt' \
  --warmup-init-lr 1e-7 --warmup-updates 4000 \
- --lr 5e-4 --min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\
+ --lr 5e-4 --stop-min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\
  --dropout 0.3 \
  --label-smoothing 0.1\
  --max-tokens 3584
@@ -44,7 +44,7 @@ fairseq-train \
  --optimizer adam --adam-betas '(0.9, 0.98)' \
  --lr-scheduler 'inverse_sqrt' \
  --warmup-init-lr 1e-7 --warmup-updates 4000 \
- --lr 5e-4 --min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\
+ --lr 5e-4 --stop-min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\
  --dropout 0.3 \
  --label-smoothing 0.1\
  --max-tokens 3584
@@ -65,7 +65,7 @@ fairseq-train \
  --optimizer adam --adam-betas '(0.9, 0.98)' \
  --lr-scheduler 'inverse_sqrt' \
  --warmup-init-lr 1e-7 --warmup-updates 4000 \
- --lr 5e-4 --min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\
+ --lr 5e-4 --stop-min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\
  --dropout 0.3 \
  --label-smoothing 0.1\
  --max-tokens 3584
@@ -268,7 +268,7 @@ CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt17.de_fr.en.bpe16k/ \
  --arch multilingual_transformer_iwslt_de_en \
  --share-decoders --share-decoder-input-output-embed \
  --optimizer adam --adam-betas '(0.9, 0.98)' \
- --lr 0.0005 --lr-scheduler inverse_sqrt --min-lr '1e-09' \
+ --lr 0.0005 --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' \
  --label-smoothing 0.1 --criterion label_smoothed_cross_entropy \
  --dropout 0.3 --weight-decay 0.0001 \
@@ -24,7 +24,7 @@ fairseq-train --ddp-backend='no_c10d' \
  --arch transformer_wmt_en_de --share-all-embeddings \
  --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
  --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \
- --lr 0.0007 --min-lr 1e-09 \
+ --lr 0.0007 \
  --dropout 0.1 --weight-decay 0.0 --criterion cross_entropy \
  --max-tokens 3584
  ```
@@ -186,7 +186,7 @@ $ python examples/wav2vec/wav2vec_manifest.py /path/to/waves --dest /manifest/pa

  ```
  $ python train.py /manifest/path --save-dir /model/path --num-workers 6 --fp16 --max-update 400000 --save-interval 1 --no-epoch-checkpoints \
- --arch wav2vec --task audio_pretraining --lr 1e-06 --min-lr 1e-09 --optimizer adam --max-lr 0.005 --lr-scheduler cosine \
+ --arch wav2vec --task audio_pretraining --lr 1e-06 --stop-min-lr 1e-09 --optimizer adam --max-lr 0.005 --lr-scheduler cosine \
  --conv-feature-layers [(512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1)] \
  --conv-aggregator-layers [(512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)] \
  --skip-connections-agg --residual-scale 0.5 --log-compression --warmup-updates 500 --warmup-init-lr 1e-07 --criterion wav2vec --num-negatives 10 \
@@ -244,7 +244,7 @@ $ python examples/wav2vec/wav2vec_manifest.py /path/to/waves --dest /manifest/pa

  ```
  $ python train.py /manifest/path --save-dir /model/path --num-workers 6 --fp16 --max-update 400000 \
- --save-interval 1 --no-epoch-checkpoints --arch wav2vec --task audio_pretraining --lr 1e-06 --min-lr 1e-09 \
+ --save-interval 1 --no-epoch-checkpoints --arch wav2vec --task audio_pretraining --lr 1e-06 --stop-min-lr 1e-09 \
  --optimizer adam --max-lr 1e-05 --lr-scheduler cosine \
  --conv-feature-layers [(512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1), (512, 1, 1)] \
  --conv-aggregator-layers [(512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)] \
@@ -458,7 +458,6 @@ def _upgrade_state_dict(state):
             "iterations_in_epoch": state["extra_state"].get("batch_offset", 0),
         }

-    # old model checkpoints may not have separate source/target positions
     # backward compatibility, cfg updates
     if "args" in state and state["args"] is not None:
         # default to translation task
@@ -474,15 +473,20 @@ def _upgrade_state_dict(state):
             state["extra_state"]["train_iterator"]["epoch"] = max(
                 state["extra_state"]["train_iterator"].get("epoch", 1), 1
             )
         # --remove-bpe ==> --postprocess
         if hasattr(state["args"], "remove_bpe"):
             state["args"].post_process = state["args"].remove_bpe
+        # --min-lr ==> --stop-min-lr
+        if hasattr(state["args"], "min_lr"):
+            state["args"].stop_min_lr = state["args"].min_lr
+            del state["args"].min_lr

         state["cfg"] = convert_namespace_to_omegaconf(state["args"])

     if "cfg" in state and state["cfg"] is not None:
         with open_dict(state["cfg"]):
             if state["cfg"].task is not None:
+                # old model checkpoints may not have separate source/target positions
                 if hasattr(state["cfg"].task, "max_positions") and not hasattr(
                     state["cfg"].task, "max_source_positions"
                 ):
@@ -465,7 +465,7 @@ class OptimizationConfig(FairseqDataclass):
             " (note: this may be interpreted differently depending on --lr-scheduler)"
         },
     )
-    min_lr: float = field(
+    stop_min_lr: float = field(
         default=-1.0,
         metadata={"help": "stop training when the learning rate reaches this minimum"},
     )
@@ -125,7 +125,15 @@ def main(cfg: DictConfig) -> None:
     lr = trainer.get_lr()
     train_meter = meters.StopwatchMeter()
     train_meter.start()
-    while lr > cfg.optimization.min_lr and epoch_itr.next_epoch_idx <= max_epoch:
+    while epoch_itr.next_epoch_idx <= max_epoch:
+        if lr <= cfg.optimization.stop_min_lr:
+            logger.info(
+                f"stopping training because current learning rate ({lr}) is smaller "
+                "than or equal to minimum learning rate "
+                f"(--stop-min-lr={cfg.optimization.stop_min_lr})"
+            )
+            break
+
         # train for one epoch
         valid_losses, should_stop = train(cfg, trainer, task, epoch_itr)
         if should_stop:
@@ -1454,7 +1454,7 @@ def train_legacy_masked_language_model(data_dir, arch, extra_args=()):
             "0.5",
             "--lr",
             "0.0001",
-            "--min-lr",
+            "--stop-min-lr",
             "1e-09",
             # dropout, attention args
             "--dropout",