From 08ed3669145580ccd5e0d0ee64005d4600e083a5 Mon Sep 17 00:00:00 2001
From: Joerg Tiedemann
Date: Thu, 4 Nov 2021 09:57:48 +0200
Subject: [PATCH] allas storage commands

---
 Makefile             | 26 ++++++++++++++++++++++++++
 lib/allas.mk         | 41 +++++++++++++++++++++++++++++++++++++++++
 scripts/fix_vocab.py |  2 +-
 3 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 5c4214f9..ebdb50ba 100644
--- a/Makefile
+++ b/Makefile
@@ -378,3 +378,29 @@ train-and-start-bt-jobs: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
 	${MAKE} local-dist
 	${MAKE} -C backtranslate MODELHOME=${MODELDIR} translate-all-wikis-jobs
 
+
+## released model packages and the matching pseudo targets (one per zip file)
+ALL_RELEASED_MODELS = ${wildcard models-tatoeba/*/*.zip}
+ALL_VOCABS_FIXED = ${patsubst %.zip,%.fixed-vocab,${ALL_RELEASED_MODELS}}
+
+## check (and repair if necessary) the vocabulary file of every released model
+.PHONY: fix-released-vocabs
+fix-released-vocabs: ${ALL_VOCABS_FIXED}
+
+## extract the vocab.yml file from a model zip and run scripts/fix_vocab.py on it;
+## if the script left a .bak copy, put the vocab file and the backup back into the zip
+## NOTE: no %.fixed-vocab file is ever created, so the recipe runs on every call
+## NOTE: unzip -o avoids the interactive overwrite prompt if an extracted file is left over
+%.fixed-vocab: %.zip
+	@( v=`unzip -l $< | grep 'vocab\.yml$$' | sed 's/^.* //'`; \
+	  if [ "$$v" != "" ]; then \
+	    unzip -o $< $$v; \
+	    python3 scripts/fix_vocab.py $$v; \
+	    if [ -e $$v.bak ]; then \
+	      echo "update $$v in $<"; \
+	      zip $< $$v $$v.bak; \
+	    else \
+	      echo "vocab $$v is fine in $<"; \
+	    fi; \
+	    rm -f $$v $$v.bak; \
+	  fi )
diff --git a/lib/allas.mk b/lib/allas.mk
index 8b912b21..b28a377a 100644
--- a/lib/allas.mk
+++ b/lib/allas.mk
@@ -20,6 +20,9 @@ WORK_DESTDIR ?= ${WORKHOME}
 WORK_CONTAINER ?= OPUS-MT-train_${notdir ${WORKHOME}}-${WHOAMI}
 WORK_CONTAINER_JT ?= OPUS-MT-train_${notdir ${WORKHOME}}-tiedeman
 
+ALLAS_STORAGE_URL = https://object.pouta.csc.fi/
+
+
 ## store workdir on allas
 store:
 	cd ${WORK_SRCDIR} && a-put -b ${WORK_CONTAINER} --nc --follow-links --override ${LANGPAIRSTR}
@@ -43,3 +46,41 @@ fetch-data:
 	mkdir -p ${WORK_DESTDIR}
 	cd ${WORK_DESTDIR} && a-get ${WORK_CONTAINER}/data.tar
 
+
+
+## generic recipe for storing work data and removing it from the file system
+## DANGEROUS --- this really deletes the data!
+## NOTE: makes container also world-readable (see swift post command)
+##   --> this makes it easier to fetch things without login credentials; do not store sensitive data here!
+## The steps are chained with &&, so the local copy is only removed after a-put has succeeded
+%.stored: %
+	if [ "$(firstword $(subst -, ,$(subst /, ,$@)))" = "work" ]; then \
+	  b=OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}; \
+	  cd $(dir $@) && \
+	  a-put -b $$b --nc --follow-links --override $(notdir $<) && \
+	  rm -fr $(notdir $<) && \
+	  touch $(notdir $@) && \
+	  rm -f $(notdir $(@:.stored=.fetched)) && \
+	  swift post $$b --read-acl ".r:*"; \
+	fi
+
+
+## TODO: fetch with wget instead of using a-commands
+## fetch work data from allas (only acts on targets below work*/; removes the matching .stored flag)
+%.fetched:
+	if [ "$(firstword $(subst -, ,$(subst /, ,$@)))" = "work" ]; then \
+	  cd $(dir $@) && \
+	  a-get OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}/$(notdir $(@:.fetched=.tar)) && \
+	  touch $(notdir $@) && \
+	  rm -f $(notdir $(@:.fetched=.stored)); \
+	fi
+
+## another way of fetching work data
+## requires settings SRCLANGS and TRGLANGS (or LANGPAIRSTR directly)
+work-%/${LANGPAIRSTR}:
+	mkdir -p $(dir $@)
+	cd $(dir $@) && a-get OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}/${LANGPAIRSTR}.tar
+
+
+UPLOAD_MODELS=$(patsubst %,%.stored,${wildcard work-tatoeba/[dg-rt-z]*})
+upload-workfiles: ${UPLOAD_MODELS}
diff --git a/scripts/fix_vocab.py b/scripts/fix_vocab.py
index c6965880..5e34d0fb 100755
--- a/scripts/fix_vocab.py
+++ b/scripts/fix_vocab.py
@@ -10,7 +10,7 @@ filename = sys.argv[1]
 
 try:
     input = open(filename, 'r')
-    yaml.load(input)
+    yaml.safe_load(input)
 except:
     print('YAML file is broken - try to fix it!')
     print(f'copy {filename} to {filename}.bak')