now includes the MPI rank in each log msg; bug fix: fixed a few compilation problems on Windows

This commit is contained in:
Frank Seide 2018-10-05 14:38:12 -07:00
parent bc63e72680
commit 1057841140
12 changed files with 59 additions and 10 deletions

View File

@ -122,6 +122,15 @@ void createLoggers(const marian::Config* options) {
#endif
}
// modify the log pattern for the "general" logger to include the MPI rank
// This is called upon initializing MPI. It is needed to associated error messages to ranks.
void switchtoMultinodeLogging(std::string nodeIdStr) {
Logger log = spdlog::get("general");
if (log)
log->set_pattern("[%Y-%m-%d %T " + nodeIdStr + "] %v");
}
namespace marian {
void noinline logCallStack(size_t skipLevels)
{

View File

@ -110,3 +110,4 @@ void checkedLog(std::string logger, std::string level, Args... args) {
}
void createLoggers(const marian::Config* options = nullptr);
void switchtoMultinodeLogging(std::string nodeIdStr);

View File

@ -490,7 +490,7 @@ struct RowsNodeOp : public NaryNodeOp {
struct SelectNodeOp : public NaryNodeOp {
SelectNodeOp(Expr a, Expr indices, int axis)
: NaryNodeOp({a, indices}, newShape(a, axis, indices->shape().elements())),
axis_{a->shape().axis(axis)} {
axis_(a->shape().axis(axis)) {
matchOrAbort<IndexType>(indices->value_type());
}

2
src/layers/word2vec_reader.h Normal file → Executable file
View File

@ -56,7 +56,7 @@ public:
embs.reserve(dimVoc * dimEmb);
// Populate output vector with embedding
for(size_t word = 0; word < (size_t)dimVoc; ++word) {
for(Word word = 0; word < (Word)dimVoc; ++word) {
// For words not occurring in the file use uniform distribution
if(word2vec.find(word) == word2vec.end()) {
auto randVals = randomEmbeddings(dimVoc, dimEmb);

View File

@ -293,7 +293,7 @@ public:
if(specialSymbols_.count(embIdx[i])) {
stateHardAtt->getAttentionIndices()[i]++;
if(stateHardAtt->getAttentionIndices()[i] >= dimSrcWords)
stateHardAtt->getAttentionIndices()[i] = dimSrcWords - 1;
stateHardAtt->getAttentionIndices()[i] = (unsigned int)dimSrcWords - 1;
}
}
}

2
src/models/transformer_factory.h Normal file → Executable file
View File

@ -13,4 +13,6 @@ Ptr<EncoderBase> NewEncoderTransformer(Ptr<Options> options);
Ptr<DecoderBase> NewDecoderTransformer(Ptr<Options> options);
} // namespace marian
#ifndef _MSC_VER
#include "models/transformer.h"
#endif

View File

@ -517,7 +517,7 @@ void Insert(Tensor out,
int length = inShape.elements();
functional::Array<int, functional::Shape::size()> dims;
int axisCPU = axis + functional::Shape::size() - out->shape().size();
int axisCPU = (int)(axis + functional::Shape::size() - out->shape().size());
for(int index = 0; index < length; ++index) {
inShape.dims(index, dims);

View File

@ -80,14 +80,16 @@ public:
MPI_Comm_size(MPI_COMM_WORLD, &comm_world_size_);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank_);
// patch logging pattern to include the MPI process info
switchtoMultinodeLogging(std::to_string(MPIWrapper::myMPIRank()));
// log hostnames in order, and test
// @TODO: We call ourselves here. Not sure if that is properly allowed.
for (size_t r = 0; r < numMPIProcesses(); r++) {
MPIWrapper::barrier();
if (r == MPIWrapper::myMPIRank())
LOG(info, "[mpi] initialized as {}", MPIWrapper::idStr());
LOG(info, "[mpi] initialized {} processes", MPIWrapper::numMPIProcesses());
MPIWrapper::barrier();
}
MPIWrapper::barrier();
}
virtual size_t myMPIRank() const override { return (size_t)my_rank_; };

View File

@ -81,7 +81,11 @@ public:
v.resize(vecLen);
bCast(v.data(), v.size(), getDataType(v.data()), rootRank, comm);
}
std::string idStr() { // helper to identify the node in logs
std::string idStr() const { // helper to identify the node in logs
return hostnameAndProcessId() + " MPI rank " + std::to_string(myMPIRank()) + " out of " + std::to_string(numMPIProcesses());
}
protected:
static std::string hostnameAndProcessId() { // helper to get hostname:pid
#ifdef _WIN32
std::string hostname = getenv("COMPUTERNAME");
auto processId = GetCurrentProcessId();
@ -93,7 +97,7 @@ public:
}();
auto processId = getpid();
#endif
return hostname + ":" + std::to_string(processId) + " MPI rank " + std::to_string(myMPIRank()) + " out of " + std::to_string(numMPIProcesses());
return hostname + ":" + std::to_string(processId);
}
};

1
src/training/graph_group_singleton.cpp Normal file → Executable file
View File

@ -17,7 +17,6 @@ void SingletonGraph::execute(Ptr<data::Batch> batch) {
graph_->backward();
// Get batch stats
size_t batch_words = batch->wordsTrg();
opt_->update(graph_);
if(mvAvg_) {

View File

@ -123,6 +123,8 @@
<ClInclude Include="..\src\command\marian_conv.cpp" />
<ClCompile Include="..\src\command\marian_main.cpp" />
<ClCompile Include="..\src\common\binary.cpp" />
<ClCompile Include="..\src\common\cli_wrapper.cpp" />
<ClCompile Include="..\src\common\config_validator.cpp" />
<ClCompile Include="..\src\common\io.cpp" />
<ClCompile Include="..\src\common\utils.cpp" />
<ClCompile Include="..\src\common\logging.cpp" />
@ -202,9 +204,15 @@
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\src\common\binary.h" />
<ClInclude Include="..\src\common\cli_helper.h" />
<ClInclude Include="..\src\common\cli_wrapper.h" />
<ClInclude Include="..\src\common\config_validator.h" />
<ClInclude Include="..\src\common\filesystem.h" />
<ClInclude Include="..\src\common\hash.h" />
<ClInclude Include="..\src\common\io.h" />
<ClInclude Include="..\src\common\io_item.h" />
<ClInclude Include="..\src\common\timer.h" />
<ClInclude Include="..\src\common\types.h" />
<ClInclude Include="..\src\layers\loss.h" />
<ClInclude Include="..\src\layers\weight.h" />
<ClInclude Include="..\src\marian.h" />

View File

@ -247,6 +247,12 @@
<ClCompile Include="..\src\3rd_party\ExceptionWithCallStack.cpp">
<Filter>3rd_party</Filter>
</ClCompile>
<ClCompile Include="..\src\common\cli_wrapper.cpp">
<Filter>common</Filter>
</ClCompile>
<ClCompile Include="..\src\common\config_validator.cpp">
<Filter>common</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\src\marian.h" />
@ -1043,6 +1049,24 @@
<ClInclude Include="..\src\common\timer.h">
<Filter>common</Filter>
</ClInclude>
<ClInclude Include="..\src\common\cli_helper.h">
<Filter>common</Filter>
</ClInclude>
<ClInclude Include="..\src\common\cli_wrapper.h">
<Filter>common</Filter>
</ClInclude>
<ClInclude Include="..\src\common\config_validator.h">
<Filter>common</Filter>
</ClInclude>
<ClInclude Include="..\src\common\filesystem.h">
<Filter>common</Filter>
</ClInclude>
<ClInclude Include="..\src\common\hash.h">
<Filter>common</Filter>
</ClInclude>
<ClInclude Include="..\src\common\types.h">
<Filter>common</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="3rd_party">