mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-07-14 15:40:33 +03:00
allas storage commands
This commit is contained in:
parent
dbcdab4d6b
commit
08ed366914
19
Makefile
19
Makefile
@ -378,3 +378,22 @@ train-and-start-bt-jobs: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
|
||||
${MAKE} local-dist
|
||||
${MAKE} -C backtranslate MODELHOME=${MODELDIR} translate-all-wikis-jobs
|
||||
|
||||
|
||||
## All released model packages and the pseudo-targets used to repair the
## vocabulary file inside each of them.
## Simply-expanded (:=) so that the file-system glob runs once at parse time
## instead of on every expansion of the variable.
ALL_RELEASED_MODELS := ${wildcard models-tatoeba/*/*.zip}
ALL_VOCABS_FIXED    := ${patsubst %.zip,%.fixed-vocab,${ALL_RELEASED_MODELS}}

## fix-released-vocabs is a command, not a file: declare it phony so that a
## stray file with the same name cannot silently disable it.
.PHONY: fix-released-vocabs
fix-released-vocabs: ${ALL_VOCABS_FIXED}
|
||||
|
||||
## Check (and if necessary repair) the vocabulary file of one model package.
##   1. look up the archive member whose name ends in vocab.yml
##   2. extract it and run scripts/fix_vocab.py on it
##   3. if the script left a .bak copy behind, put the vocabulary file and
##      the backup back into the zip archive
##   4. remove the extracted files again
## Nothing is done if the archive has no vocab.yml member.
## NOTE: no file named %.fixed-vocab is ever created, so the recipe runs
##       every time the target is requested.
%.fixed-vocab: %.zip
	@vocab=$$(unzip -l $< | grep 'vocab.yml$$' | sed 's/^.* //'); \
	if [ -n "$$vocab" ]; then \
	  unzip $< $$vocab; \
	  python3 scripts/fix_vocab.py $$vocab; \
	  if [ -e $$vocab.bak ]; then \
	    echo "update $$vocab in $<"; \
	    zip $< $$vocab $$vocab.bak; \
	  else \
	    echo "vocab $$vocab is fine in $<"; \
	  fi; \
	  rm -f $$vocab $$vocab.bak; \
	fi
|
||||
|
41
lib/allas.mk
41
lib/allas.mk
@ -20,6 +20,9 @@ WORK_DESTDIR ?= ${WORKHOME}
|
||||
WORK_CONTAINER ?= OPUS-MT-train_${notdir ${WORKHOME}}-${WHOAMI}
|
||||
WORK_CONTAINER_JT ?= OPUS-MT-train_${notdir ${WORKHOME}}-tiedeman
|
||||
|
||||
ALLAS_STORAGE_URL = https://object.pouta.csc.fi/
|
||||
|
||||
|
||||
## store workdir on allas
|
||||
store:
|
||||
cd ${WORK_SRCDIR} && a-put -b ${WORK_CONTAINER} --nc --follow-links --override ${LANGPAIRSTR}
|
||||
@ -43,3 +46,41 @@ fetch-data:
|
||||
mkdir -p ${WORK_DESTDIR}
|
||||
cd ${WORK_DESTDIR} && a-get ${WORK_CONTAINER}/data.tar
|
||||
|
||||
|
||||
|
||||
## Generic recipe for storing work data on allas and removing it from the
## local file system.
## DANGEROUS --- this really deletes the data!
## NOTE: makes the container also world-readable (see swift post command)
##       --> this makes it easier to fetch things without login credentials
##       --> should not store sensitive data here!
##
## Only targets whose path starts with "work" are handled; anything else is
## silently ignored. The container name is derived from the directory part
## of the target (slashes replaced by hyphens) plus ${WHOAMI}.
##
## All steps are chained with && so that the local copy is only deleted
## after a successful upload; any failure aborts the recipe and is reported
## to make. After a successful upload the matching .fetched marker is removed.
%.stored: %
	if [ "$(firstword $(subst -, ,$(subst /, ,$@)))" = "work" ]; then \
	  b=OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}; \
	  cd $(dir $@) && \
	  a-put -b $$b --nc --follow-links --override $(notdir $<) && \
	  rm -fr $(notdir $<) && \
	  touch $(notdir $@) && \
	  rm -f $(notdir $(@:.stored=.fetched)) && \
	  swift post $$b --read-acl ".r:*"; \
	fi
|
||||
|
||||
|
||||
## TODO: fetch with wget instead of using a-commands
## Fetch work data from allas.
##
## Only targets whose path starts with "work" are handled; anything else is
## silently ignored. The object <name>.tar is retrieved from the container
## derived from the directory part of the target (slashes replaced by
## hyphens) plus ${WHOAMI}.
##
## The target directory is created if necessary and all steps are chained
## with && so that a failed cd or download aborts the recipe instead of
## leaving a misleading .fetched marker behind. After a successful download
## the matching .stored marker is removed.
%.fetched:
	if [ "$(firstword $(subst -, ,$(subst /, ,$@)))" = "work" ]; then \
	  mkdir -p $(dir $@) && \
	  cd $(dir $@) && \
	  a-get OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}/$(notdir $(@:.fetched=.tar)) && \
	  touch $(notdir $@) && \
	  rm -f $(notdir $(@:.fetched=.stored)); \
	fi
|
||||
|
||||
## Alternative way of fetching work data: ask for the language-pair
## directory itself, e.g. work-<name>/${LANGPAIRSTR}.
## Requires setting SRCLANGS and TRGLANGS (or LANGPAIRSTR directly).
## The parent directory is created first; ${LANGPAIRSTR}.tar is then
## retrieved from the container named after that directory and ${WHOAMI}.
work-%/${LANGPAIRSTR}:
	mkdir -p ${@D}
	cd ${@D} && a-get OPUS-MT-train_${subst /,-,${dir $@}}${WHOAMI}/${LANGPAIRSTR}.tar
|
||||
|
||||
|
||||
## Upload (and remove locally) the work directories in work-tatoeba whose
## names start with one of the letters d, g-r or t-z.
## The *.stored / *.fetched marker files live in the same directory and
## would also match the glob; they are filtered out so that a second run
## does not try to upload and delete the markers themselves.
## Simply-expanded (:=) so that the glob is evaluated only once.
UPLOAD_MODELS := $(patsubst %,%.stored,\
                   $(filter-out %.stored %.fetched,\
                     ${wildcard work-tatoeba/[dg-rt-z]*}))

.PHONY: upload-workfiles
upload-workfiles: ${UPLOAD_MODELS}
|
||||
|
@ -10,7 +10,7 @@ filename = sys.argv[1]
|
||||
|
||||
try:
|
||||
input = open(filename, 'r')
|
||||
yaml.load(input)
|
||||
yaml.safe_load(input)
|
||||
except:
|
||||
print('YAML file is broken - try to fix it!')
|
||||
print(f'copy {filename} to {filename}.bak')
|
||||
|
Loading…
Reference in New Issue
Block a user