From 72a25a4e52402b6f53aa98cfb739c075c0d6f7ee Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Sat, 5 Dec 2020 07:36:28 -0800
Subject: [PATCH] Rename optimization.min_lr -> optimization.stop_min_lr
 (#1486)

Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/1486

Test Plan: Imported from OSS

Reviewed By: alexeib

Differential Revision: D25342181

Pulled By: myleott

fbshipit-source-id: 7d1cfb26334fff26d688648724ab073e5fb956f5
---
 docs/getting_started.rst                              |  3 ++-
 examples/cross_lingual_language_model/README.md       |  2 +-
 examples/language_model/README.adaptive_inputs.md     |  2 +-
 examples/latent_depth/README.md                       |  2 +-
 examples/mbart/README.md                              |  2 +-
 examples/multilingual/README.md                       |  4 ++--
 examples/multilingual/finetune_multilingual_model.sh  |  2 +-
 examples/multilingual/train_multilingual_model.sh     |  2 +-
 examples/nonautoregressive_translation/README.md      |  2 +-
 examples/nonautoregressive_translation/scripts.md     | 12 ++++++------
 examples/pay_less_attention_paper/README.md           | 10 +++++-----
 examples/quant_noise/README.md                        |  4 ++--
 examples/simultaneous_translation/README.md           |  6 +++---
 examples/translation/README.md                        |  2 +-
 examples/translation_moe/README.md                    |  2 +-
 examples/wav2vec/README.md                            |  4 ++--
 fairseq/checkpoint_utils.py                           |  8 ++++++--
 fairseq/dataclass/configs.py                          |  2 +-
 fairseq_cli/train.py                                  | 10 +++++++++-
 tests/test_binaries.py                                |  2 +-
 20 files changed, 48 insertions(+), 35 deletions(-)

diff --git a/docs/getting_started.rst b/docs/getting_started.rst
index 5d1d2d69..745ad776 100644
--- a/docs/getting_started.rst
+++ b/docs/getting_started.rst
@@ -182,9 +182,10 @@ sure to update ``--master_addr`` to the IP address of the first node:
         --arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \
         --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
         --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \
-        --lr 0.0005 --min-lr 1e-09 \
+        --lr 0.0005 \
         --dropout 0.3 --weight-decay 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
         --max-tokens 3584 \
+        --max-epoch 70 \
         --fp16
 
 On SLURM clusters, fairseq will automatically detect the number of nodes and
diff --git a/examples/cross_lingual_language_model/README.md b/examples/cross_lingual_language_model/README.md
index a78f86d8..f4c76cfe 100644
--- a/examples/cross_lingual_language_model/README.md
+++ b/examples/cross_lingual_language_model/README.md
@@ -61,7 +61,7 @@ fairseq-train \
 --max-update 2400000 --save-interval 1 --no-epoch-checkpoints \
 --arch xlm_base \
 --optimizer adam --lr-scheduler reduce_lr_on_plateau \
---lr-shrink 0.5 --lr 0.0001 --min-lr 1e-09 \
+--lr-shrink 0.5 --lr 0.0001 --stop-min-lr 1e-09 \
 --dropout 0.1 \
 --criterion legacy_masked_lm_loss \
 --max-tokens 2048 --tokens-per-sample 256 --attention-dropout 0.1 \
diff --git a/examples/language_model/README.adaptive_inputs.md b/examples/language_model/README.adaptive_inputs.md
index 68734671..2ab37330 100644
--- a/examples/language_model/README.adaptive_inputs.md
+++ b/examples/language_model/README.adaptive_inputs.md
@@ -20,7 +20,7 @@ fairseq-train --task language_modeling \
     --save-dir checkpoints/transformer_wikitext-103 \
     --arch transformer_lm_wiki103 \
     --max-update 286000 --max-lr 1.0 --t-mult 2 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 \
-    --warmup-updates 16000 --warmup-init-lr 1e-07 --min-lr 1e-09 --optimizer nag --lr 0.0001 --clip-norm 0.1 \
+    --warmup-updates 16000 --warmup-init-lr 1e-07 --stop-min-lr 1e-09 --optimizer nag --lr 0.0001 --clip-norm 0.1 \
     --criterion adaptive_loss --max-tokens 3072 --update-freq 3 --tokens-per-sample 3072 --seed 1 \
     --sample-break-mode none --skip-invalid-size-inputs-valid-test --ddp-backend=no_c10d
 ```
diff --git a/examples/latent_depth/README.md b/examples/latent_depth/README.md
index a0ec55a3..e70e1640 100644
--- a/examples/latent_depth/README.md
+++ b/examples/latent_depth/README.md
@@ -25,7 +25,7 @@ fairseq-train ${databin_dir} \
   --share-decoder-input-output-embed \
   --dropout 0.3 --attention-dropout 0.3 \
   --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
-  --lr-scheduler inverse_sqrt --min-lr 1e-9 --warmup-init-lr 1e-7 --warmup-updates 8000 \
+  --lr-scheduler inverse_sqrt --stop-min-lr 1e-9 --warmup-init-lr 1e-7 --warmup-updates 8000 \
   --max-tokens 4096 --update-freq 1 \
   --lr 0.0015 \
   --clip-norm 1.0 \
diff --git a/examples/mbart/README.md b/examples/mbart/README.md
index fa520a68..8a3e22d4 100644
--- a/examples/mbart/README.md
+++ b/examples/mbart/README.md
@@ -73,7 +73,7 @@ fairseq-train path_2_data \
   --source-lang en_XX --target-lang ro_RO \
   --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
   --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
-  --lr-scheduler polynomial_decay --lr 3e-05 --min-lr -1 --warmup-updates 2500 --total-num-update 40000 \
+  --lr-scheduler polynomial_decay --lr 3e-05 --warmup-updates 2500 --total-num-update 40000 \
   --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
   --max-tokens 1024 --update-freq 2 \
   --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
diff --git a/examples/multilingual/README.md b/examples/multilingual/README.md
index 3559c244..35eca898 100644
--- a/examples/multilingual/README.md
+++ b/examples/multilingual/README.md
@@ -41,7 +41,7 @@ fairseq-train $path_2_data \
   --lang-pairs "$lang_pairs" \
   --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
   --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
-  --lr-scheduler inverse_sqrt --lr 3e-05 --min-lr -1 --warmup-updates 2500 --max-update 40000 \
+  --lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \
   --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
   --max-tokens 1024 --update-freq 2 \
   --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
@@ -69,7 +69,7 @@ fairseq-train $path_2_data \
   --lang-pairs "$lang_pairs" \
   --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
   --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
-  --lr-scheduler inverse_sqrt --lr 3e-05 --min-lr -1 --warmup-updates 2500 --max-update 40000 \
+  --lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \
   --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
   --max-tokens 1024 --update-freq 2 \
   --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
diff --git a/examples/multilingual/finetune_multilingual_model.sh b/examples/multilingual/finetune_multilingual_model.sh
index cfa9a861..ffcf1fc7 100644
--- a/examples/multilingual/finetune_multilingual_model.sh
+++ b/examples/multilingual/finetune_multilingual_model.sh
@@ -20,7 +20,7 @@ fairseq-train "$path_2_data" \
   --lang-pairs "$lang_pairs" \
   --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
   --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
-  --lr-scheduler inverse_sqrt --lr 3e-05 --min-lr -1 --warmup-updates 2500 --max-update 40000 \
+  --lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \
   --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
   --max-tokens 1024 --update-freq 2 \
   --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
diff --git a/examples/multilingual/train_multilingual_model.sh b/examples/multilingual/train_multilingual_model.sh
index 09014c82..c41730df 100644
--- a/examples/multilingual/train_multilingual_model.sh
+++ b/examples/multilingual/train_multilingual_model.sh
@@ -16,7 +16,7 @@ fairseq-train "$path_2_data" \
   --lang-pairs "$lang_pairs" \
   --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
   --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
-  --lr-scheduler inverse_sqrt --lr 3e-05 --min-lr -1 --warmup-updates 2500 --max-update 40000 \
+  --lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \
   --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
   --max-tokens 1024 --update-freq 2 \
   --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
diff --git a/examples/nonautoregressive_translation/README.md b/examples/nonautoregressive_translation/README.md
index dfc592f0..7b2d42a9 100644
--- a/examples/nonautoregressive_translation/README.md
+++ b/examples/nonautoregressive_translation/README.md
@@ -44,7 +44,7 @@ fairseq-train \
     --share-all-embeddings \
     --optimizer adam --adam-betas '(0.9,0.98)' \
     --lr 0.0005 --lr-scheduler inverse_sqrt \
-    --min-lr '1e-09' --warmup-updates 10000 \
+    --stop-min-lr '1e-09' --warmup-updates 10000 \
     --warmup-init-lr '1e-07' --label-smoothing 0.1 \
     --dropout 0.3 --weight-decay 0.01 \
     --decoder-learned-pos \
diff --git a/examples/nonautoregressive_translation/scripts.md b/examples/nonautoregressive_translation/scripts.md
index 63b945c1..a3a33e6e 100644
--- a/examples/nonautoregressive_translation/scripts.md
+++ b/examples/nonautoregressive_translation/scripts.md
@@ -14,7 +14,7 @@ fairseq-train \
     --share-all-embeddings \
     --optimizer adam --adam-betas '(0.9,0.98)' \
     --lr 0.0005 --lr-scheduler inverse_sqrt \
-    --min-lr '1e-09' --warmup-updates 10000 \
+    --stop-min-lr '1e-09' --warmup-updates 10000 \
     --warmup-init-lr '1e-07' --label-smoothing 0.1 \
     --dropout 0.3 --weight-decay 0.01 \
     --decoder-learned-pos \
@@ -43,7 +43,7 @@ fairseq-train \
     --share-all-embeddings \
     --optimizer adam --adam-betas '(0.9,0.98)' \
     --lr 0.0005 --lr-scheduler inverse_sqrt \
-    --min-lr '1e-09' --warmup-updates 10000 \
+    --stop-min-lr '1e-09' --warmup-updates 10000 \
     --warmup-init-lr '1e-07' --label-smoothing 0.1 \
     --dropout 0.3 --weight-decay 0.01 \
     --decoder-learned-pos \
@@ -76,7 +76,7 @@ fairseq-train \
     --share-all-embeddings \
     --optimizer adam --adam-betas '(0.9,0.98)' \
     --lr 0.0005 --lr-scheduler inverse_sqrt \
-    --min-lr '1e-09' --warmup-updates 10000 \
+    --stop-min-lr '1e-09' --warmup-updates 10000 \
     --warmup-init-lr '1e-07' --label-smoothing 0.1 \
     --dropout 0.3 --weight-decay 0.01 \
     --decoder-learned-pos \
@@ -109,7 +109,7 @@ fairseq-train \
     --share-all-embeddings \
     --optimizer adam --adam-betas '(0.9,0.98)' \
     --lr 0.0005 --lr-scheduler inverse_sqrt \
-    --min-lr '1e-09' --warmup-updates 10000 \
+    --stop-min-lr '1e-09' --warmup-updates 10000 \
     --warmup-init-lr '1e-07' --label-smoothing 0.1 \
     --dropout 0.3 --weight-decay 0.01 \
     --decoder-learned-pos \
@@ -136,7 +136,7 @@ fairseq-train \
     --share-all-embeddings \
     --optimizer adam --adam-betas '(0.9,0.98)' \
     --lr 0.0005 --lr-scheduler inverse_sqrt \
-    --min-lr '1e-09' --warmup-updates 10000 \
+    --stop-min-lr '1e-09' --warmup-updates 10000 \
     --warmup-init-lr '1e-07' --label-smoothing 0.1 \
     --dropout 0.3 --weight-decay 0.01 \
     --decoder-learned-pos \
@@ -165,7 +165,7 @@ fairseq-train \
     --share-all-embeddings \
     --optimizer adam --adam-betas '(0.9,0.98)' \
     --lr 0.0005 --lr-scheduler inverse_sqrt \
-    --min-lr '1e-09' --warmup-updates 10000 \
+    --stop-min-lr '1e-09' --warmup-updates 10000 \
     --warmup-init-lr '1e-07' --label-smoothing 0.1 \
     --dropout 0.3 --weight-decay 0.01 \
     --decoder-learned-pos \
diff --git a/examples/pay_less_attention_paper/README.md b/examples/pay_less_attention_paper/README.md
index 3fb93b23..537ca5f2 100644
--- a/examples/pay_less_attention_paper/README.md
+++ b/examples/pay_less_attention_paper/README.md
@@ -110,7 +110,7 @@ mkdir -p $SAVE
 CUDA_VISIBLE_DEVICES=0 $(which fairseq-train) data-bin/iwslt14.tokenized.de-en \
     --clip-norm 0 --optimizer adam --lr 0.0005 \
     --source-lang de --target-lang en --max-tokens 4000 --no-progress-bar \
-    --log-interval 100 --min-lr '1e-09' --weight-decay 0.0001 \
+    --log-interval 100 --stop-min-lr '1e-09' --weight-decay 0.0001 \
     --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
     --lr-scheduler inverse_sqrt \
     --ddp-backend=no_c10d \
@@ -137,10 +137,10 @@ python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \
     --max-update 30000 --share-all-embeddings --optimizer adam \
     --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \
     --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
-    --min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \
+    --stop-min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \
     --ddp-backend=no_c10d --max-tokens 3584 \
     --lr-scheduler cosine --warmup-init-lr 1e-7 --warmup-updates 10000 \
-    --lr-shrink 1 --max-lr 0.001 --lr 1e-7 --min-lr 1e-9 --warmup-init-lr 1e-07 \
+    --lr-shrink 1 --max-lr 0.001 --lr 1e-7 --warmup-init-lr 1e-07 \
     --t-mult 1 --lr-period-updates 20000 \
     --arch lightconv_wmt_en_de_big --save-dir $SAVE \
     --dropout 0.3 --attention-dropout 0.1 --weight-dropout 0.1 \
@@ -162,10 +162,10 @@ python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \
     --max-update 30000 --share-all-embeddings --optimizer adam \
     --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \
     --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
-    --min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \
+    --stop-min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \
     --ddp-backend=no_c10d --max-tokens 3584 \
     --lr-scheduler cosine --warmup-init-lr 1e-7 --warmup-updates 10000 \
-    --lr-shrink 1 --max-lr 0.001 --lr 1e-7 --min-lr 1e-9 --warmup-init-lr 1e-07 \
+    --lr-shrink 1 --max-lr 0.001 --lr 1e-7 --warmup-init-lr 1e-07 \
     --t-mult 1 --lr-period-updates 70000 \
     --arch lightconv_wmt_en_fr_big --save-dir $SAVE \
     --dropout 0.1 --attention-dropout 0.1 --weight-dropout 0.1 \
diff --git a/examples/quant_noise/README.md b/examples/quant_noise/README.md
index 057ea620..9fe492d0 100644
--- a/examples/quant_noise/README.md
+++ b/examples/quant_noise/README.md
@@ -212,7 +212,7 @@ fairseq-train --task language_modeling /path/to/wikitext-103/data \
     --max-tokens 3072 --tokens-per-sample 3072 --momentum 0.99 --optimizer nag \
     --sample-break-mode none --update-freq 3 \
     --warmup-init-lr 1e-07 --warmup-updates 16000 \
-    --weight-decay 0 --seed 1 --min-lr 1e-09 \
+    --weight-decay 0 --seed 1 --stop-min-lr 1e-09 \
     --quant-noise-pq 0.05 --quant-noise-pq-block-size 8
 ```
 
@@ -269,7 +269,7 @@ fairseq-train --task language_modeling /path/to/wikitext-103/data \
     --ddp-backend no_c10d \
     --decoder-attention-heads 8 --decoder-embed-dim 1024 --decoder-ffn-embed-dim 4096 --decoder-input-dim 1024 --decoder-layers 16 --decoder-normalize-before --decoder-output-dim 1024 \
     --fp16 --keep-last-epochs -1 \
-    --lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --max-lr 0.05 --min-lr 1e-09 \
+    --lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --max-lr 0.05 --stop-min-lr 1e-09 \
     --max-tokens 2944 --tokens-per-sample 2944\
     --momentum 0.99 --no-epoch-checkpoints --no-progress-bar --optimizer nag --required-batch-size-multiple 8 \
     --sample-break-mode none --t-mult 2.0 --skip-invalid-size-inputs-valid-test \
diff --git a/examples/simultaneous_translation/README.md b/examples/simultaneous_translation/README.md
index e27b6528..bbc6dacd 100644
--- a/examples/simultaneous_translation/README.md
+++ b/examples/simultaneous_translation/README.md
@@ -23,7 +23,7 @@ fairseq-train \
     --optimizer adam --adam-betas '(0.9, 0.98)' \
     --lr-scheduler 'inverse_sqrt' \
     --warmup-init-lr 1e-7 --warmup-updates 4000 \
-    --lr 5e-4 --min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\
+    --lr 5e-4 --stop-min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\
     --dropout 0.3 \
     --label-smoothing 0.1\
     --max-tokens 3584
@@ -44,7 +44,7 @@ fairseq-train \
     --optimizer adam --adam-betas '(0.9, 0.98)' \
     --lr-scheduler 'inverse_sqrt' \
     --warmup-init-lr 1e-7 --warmup-updates 4000 \
-    --lr 5e-4 --min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\
+    --lr 5e-4 --stop-min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\
     --dropout 0.3 \
     --label-smoothing 0.1\
     --max-tokens 3584
@@ -65,7 +65,7 @@ fairseq-train \
     --optimizer adam --adam-betas '(0.9, 0.98)' \
     --lr-scheduler 'inverse_sqrt' \
     --warmup-init-lr 1e-7 --warmup-updates 4000 \
-    --lr 5e-4 --min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\
+    --lr 5e-4 --stop-min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\
     --dropout 0.3 \
     --label-smoothing 0.1\
     --max-tokens 3584
diff --git a/examples/translation/README.md b/examples/translation/README.md
index 3eb8e013..7b1fcc8d 100644
--- a/examples/translation/README.md
+++ b/examples/translation/README.md
@@ -268,7 +268,7 @@ CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt17.de_fr.en.bpe16k/ \
     --arch multilingual_transformer_iwslt_de_en \
     --share-decoders --share-decoder-input-output-embed \
     --optimizer adam --adam-betas '(0.9, 0.98)' \
-    --lr 0.0005 --lr-scheduler inverse_sqrt --min-lr '1e-09' \
+    --lr 0.0005 --lr-scheduler inverse_sqrt \
     --warmup-updates 4000 --warmup-init-lr '1e-07' \
     --label-smoothing 0.1 --criterion label_smoothed_cross_entropy \
     --dropout 0.3 --weight-decay 0.0001 \
diff --git a/examples/translation_moe/README.md b/examples/translation_moe/README.md
index ef7abdb4..3cc3fb46 100644
--- a/examples/translation_moe/README.md
+++ b/examples/translation_moe/README.md
@@ -24,7 +24,7 @@ fairseq-train --ddp-backend='no_c10d' \
     --arch transformer_wmt_en_de --share-all-embeddings \
     --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
     --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \
-    --lr 0.0007 --min-lr 1e-09 \
+    --lr 0.0007 \
     --dropout 0.1 --weight-decay 0.0 --criterion cross_entropy \
     --max-tokens 3584
 ```
diff --git a/examples/wav2vec/README.md b/examples/wav2vec/README.md
index 52dce362..bf501ab9 100644
--- a/examples/wav2vec/README.md
+++ b/examples/wav2vec/README.md
@@ -186,7 +186,7 @@ $ python examples/wav2vec/wav2vec_manifest.py /path/to/waves --dest /manifest/pa
 
 ```
 $ python train.py /manifest/path --save-dir /model/path --num-workers 6 --fp16 --max-update 400000 --save-interval 1 --no-epoch-checkpoints \
---arch wav2vec --task audio_pretraining --lr 1e-06 --min-lr 1e-09 --optimizer adam --max-lr 0.005 --lr-scheduler cosine \
+--arch wav2vec --task audio_pretraining --lr 1e-06 --stop-min-lr 1e-09 --optimizer adam --max-lr 0.005 --lr-scheduler cosine \
 --conv-feature-layers [(512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1)] \
 --conv-aggregator-layers [(512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)] \
 --skip-connections-agg --residual-scale 0.5 --log-compression --warmup-updates 500 --warmup-init-lr 1e-07 --criterion wav2vec --num-negatives 10 \
@@ -244,7 +244,7 @@ $ python examples/wav2vec/wav2vec_manifest.py /path/to/waves --dest /manifest/pa
 
 ```
 $ python train.py /manifest/path --save-dir /model/path --num-workers 6 --fp16 --max-update 400000 \
---save-interval 1 --no-epoch-checkpoints --arch wav2vec --task audio_pretraining --lr 1e-06 --min-lr 1e-09 \
+--save-interval 1 --no-epoch-checkpoints --arch wav2vec --task audio_pretraining --lr 1e-06 --stop-min-lr 1e-09 \
 --optimizer adam --max-lr 1e-05 --lr-scheduler cosine \
 --conv-feature-layers [(512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1), (512, 1, 1)] \
 --conv-aggregator-layers [(512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)] \
diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py
index 235c660a..f03875da 100644
--- a/fairseq/checkpoint_utils.py
+++ b/fairseq/checkpoint_utils.py
@@ -458,7 +458,6 @@ def _upgrade_state_dict(state):
             "iterations_in_epoch": state["extra_state"].get("batch_offset", 0),
         }
 
-    # old model checkpoints may not have separate source/target positions
     # backward compatibility, cfg updates
     if "args" in state and state["args"] is not None:
         # default to translation task
@@ -474,15 +473,20 @@ def _upgrade_state_dict(state):
             state["extra_state"]["train_iterator"]["epoch"] = max(
                 state["extra_state"]["train_iterator"].get("epoch", 1), 1
             )
-
+        # --remove-bpe ==> --postprocess
         if hasattr(state["args"], "remove_bpe"):
            state["args"].post_process = state["args"].remove_bpe
+        # --min-lr ==> --stop-min-lr
+        if hasattr(state["args"], "min_lr"):
+            state["args"].stop_min_lr = state["args"].min_lr
+            del state["args"].min_lr
 
         state["cfg"] = convert_namespace_to_omegaconf(state["args"])
 
     if "cfg" in state and state["cfg"] is not None:
         with open_dict(state["cfg"]):
             if state["cfg"].task is not None:
+                # old model checkpoints may not have separate source/target positions
                 if hasattr(state["cfg"].task, "max_positions") and not hasattr(
                     state["cfg"].task, "max_source_positions"
                 ):
diff --git a/fairseq/dataclass/configs.py b/fairseq/dataclass/configs.py
index 3ff177d9..3992e3c2 100644
--- a/fairseq/dataclass/configs.py
+++ b/fairseq/dataclass/configs.py
@@ -465,7 +465,7 @@ class OptimizationConfig(FairseqDataclass):
             " (note: this may be interpreted differently depending on --lr-scheduler)"
         },
     )
-    min_lr: float = field(
+    stop_min_lr: float = field(
         default=-1.0,
         metadata={"help": "stop training when the learning rate reaches this minimum"},
     )
diff --git a/fairseq_cli/train.py b/fairseq_cli/train.py
index 77397596..82c30321 100644
--- a/fairseq_cli/train.py
+++ b/fairseq_cli/train.py
@@ -125,7 +125,15 @@ def main(cfg: DictConfig) -> None:
     lr = trainer.get_lr()
     train_meter = meters.StopwatchMeter()
     train_meter.start()
-    while lr > cfg.optimization.min_lr and epoch_itr.next_epoch_idx <= max_epoch:
+    while epoch_itr.next_epoch_idx <= max_epoch:
+        if lr <= cfg.optimization.stop_min_lr:
+            logger.info(
+                f"stopping training because current learning rate ({lr}) is smaller "
+                "than or equal to minimum learning rate "
+                f"(--stop-min-lr={cfg.optimization.stop_min_lr})"
+            )
+            break
+
         # train for one epoch
         valid_losses, should_stop = train(cfg, trainer, task, epoch_itr)
         if should_stop:
diff --git a/tests/test_binaries.py b/tests/test_binaries.py
index cad6f1eb..58f86484 100644
--- a/tests/test_binaries.py
+++ b/tests/test_binaries.py
@@ -1454,7 +1454,7 @@ def train_legacy_masked_language_model(data_dir, arch, extra_args=()):
             "0.5",
             "--lr",
             "0.0001",
-            "--min-lr",
+            "--stop-min-lr",
             "1e-09",
             # dropout, attention args
             "--dropout",