Mirror of https://github.com/marian-nmt/marian.git (synced 2024-11-27 10:33:14 +03:00)
Check the last CUDA error after running every kernel. The program may not have been compiled for the particular GPU, or shared memory may be set incorrectly.
This commit is contained in:
parent 736e18ad5f
commit 79de7a566e
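Why the check matters (illustrative sketch, not code from this commit): a CUDA kernel launch is asynchronous and returns no status of its own, so a launch that fails — for example because the binary was not built for the GPU's compute capability, or because the grid/block/shared-memory configuration is invalid — goes unnoticed unless cudaGetLastError() is called right after the launch. The sketch below assumes a print-and-exit HANDLE_ERROR macro and a placeholder kernel gScale; neither is taken from the marian sources.

// Minimal sketch of the launch-then-check pattern this commit applies after
// every kernel launch. The HANDLE_ERROR macro here is an assumption of the
// usual print-and-exit form (the real macro is defined elsewhere in the
// repository), and gScale is a hypothetical placeholder kernel, not one of
// the kernels touched by this diff.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

static void HandleError(cudaError_t err, const char *file, int line) {
  if (err != cudaSuccess) {
    fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
    exit(EXIT_FAILURE);
  }
}
#define HANDLE_ERROR(err) (HandleError((err), __FILE__, __LINE__))

__global__ void gScale(float *data, unsigned size, float factor) {
  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < size) {
    data[i] *= factor;
  }
}

int main() {
  const unsigned size = 1024;
  float *d_data = nullptr;
  HANDLE_ERROR(cudaMalloc(&d_data, size * sizeof(float)));
  HANDLE_ERROR(cudaMemset(d_data, 0, size * sizeof(float)));

  const unsigned threads = 256;
  const unsigned blocks = (size + threads - 1) / threads;

  // A kernel launch is asynchronous and does not return an error code itself.
  // cudaGetLastError() immediately after the launch reports launch-time
  // failures: the binary was not compiled for this GPU's compute capability
  // ("invalid device function") or the launch/shared-memory configuration is
  // invalid -- exactly the failure modes named in the commit message.
  gScale<<<blocks, threads, 0>>>(d_data, size, 2.0f);
  HANDLE_ERROR(cudaGetLastError());

  // Errors raised while the kernel is executing (e.g. an illegal memory
  // access) only surface once the device or stream is synchronized.
  HANDLE_ERROR(cudaDeviceSynchronize());

  HANDLE_ERROR(cudaFree(d_data));
  return 0;
}

Running a binary built for the wrong compute capability is typically the case where the first check fires with an "invalid device function" error.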
@@ -223,6 +223,7 @@ class FastGRU: public Cell {
gElementwiseOps<<<blocks, threads, 0, mblas::CudaStreamHandler::GetStream()>>>
(nextWrap, stateWrap, ruhWrap, tempWrap,
bWrap, bx1Wrap, bx2Wrap);
HANDLE_ERROR(cudaGetLastError());

//PAUSE_TIMER("ElementwiseOps");

@@ -78,6 +78,7 @@ void Mean(Matrix& Out,
gMean<<<blocks, threads, 0, CudaStreamHandler::GetStream()>>>
(Out, In, sentenceLengthsWrap);
HANDLE_ERROR(cudaGetLastError());

}

@@ -127,6 +128,7 @@ void WeightedMean(Matrix& out,const Matrix& weights, const Matrix& in, const mbl
*/
gWeightedMean<<<nBlocks, nThreads, 0, CudaStreamHandler::GetStream()>>>
(out, weights, in, hypo2Batch);
HANDLE_ERROR(cudaGetLastError());
}

Matrix& Transpose(Matrix& Out, const Matrix& In) {

@@ -198,6 +200,7 @@ void PasteRows(Matrix& Out, const Matrix& In, const unsigned rowNo, unsigned col
gPasteRows<<<nBlocks, nThreads, 0, CudaStreamHandler::GetStream()>>>
(outWrap, inWrap, rowNo, colNo);
HANDLE_ERROR(cudaGetLastError());

}

@@ -271,6 +274,7 @@ Matrix& CopyRows(Matrix& out,
gCopyRows<<<blocks, threads, 0, CudaStreamHandler::GetStream()>>>
(out, in, inRows);
HANDLE_ERROR(cudaGetLastError());

return out;
}

@@ -326,6 +330,7 @@ Matrix& CopyRows(Matrix& out,
gCopyRows<<<blocks, threads, 0, CudaStreamHandler::GetStream()>>>
(out, in, inRows, outRows, shape);
HANDLE_ERROR(cudaGetLastError());
}

return out;

@@ -399,6 +404,8 @@ Matrix& Slice(Matrix& Out,
gSlice<<<blocks, threads, 0, CudaStreamHandler::GetStream()>>>
(outWrap, inWrap, n, dim);
HANDLE_ERROR(cudaGetLastError());

return Out;
}

@@ -583,6 +590,7 @@ Matrix& Softmax(Matrix& Out,
gSoftMax<<<blocks, threads, shared, CudaStreamHandler::GetStream()>>>
(outWrap, hypo2BatchWrap, sentenceLengthsWrap, threads);
HANDLE_ERROR(cudaGetLastError());

return Out;
}

@@ -677,6 +685,7 @@ Matrix& LogSoftmax(Matrix& Out)
gLogSoftMax<<<blocks, threads, shared, CudaStreamHandler::GetStream()>>>
(Out, threads);
HANDLE_ERROR(cudaGetLastError());

return Out;
}

@@ -700,6 +709,7 @@ void SetColumn(Matrix& In, unsigned noColumn, float value) {
gSetColumn<<<nBlocks, nThreads, 0, mblas::CudaStreamHandler::GetStream()>>>
(inWrap, noColumn, value);
HANDLE_ERROR(cudaGetLastError());
}

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -721,6 +731,7 @@ void Fill(Matrix& In, float value) {
gFill<<<nBlocks, nThreads, 0, CudaStreamHandler::GetStream()>>>
(inWrap, value);
HANDLE_ERROR(cudaGetLastError());
}
else {
HANDLE_ERROR(cudaMemsetAsync(In.data(), 0, size * sizeof(float), CudaStreamHandler::GetStream()));

@@ -756,6 +767,7 @@ void Fill0(Matrix& in, float value, const mblas::Vector<unsigned> &d_newHypoIds)
gFill0<<<nBlocks, nThreads, 0, CudaStreamHandler::GetStream()>>>
(in, value, d_newHypoIds, shape);
HANDLE_ERROR(cudaGetLastError());
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -793,6 +805,7 @@ void MapMatrix(Matrix& state,
gMapMatrix<<<numBlocks, numThreads, 0, CudaStreamHandler::GetStream()>>>
(stateWrap, sentenceLengthsWrap, i);
HANDLE_ERROR(cudaGetLastError());

/*
cerr << "nBlocks=" << numBlocks << endl;

@@ -913,6 +926,7 @@ void Normalization(Matrix &out,
gLNormalization<<<numBlocks, numThreads, shared, CudaStreamHandler::GetStream()>>>
(outWrap, inWrap, alphaWrap, *betaWrap, eps);
HANDLE_ERROR(cudaGetLastError());

/*
//std::cerr << "nBlocks=" << numBlocks << std::endl;

@@ -1412,6 +1426,8 @@ void LogSoftmaxAndNBest(mblas::Vector<NthOutBatch> &nBest,
d_isFirsts,
d_beamSizes
);
HANDLE_ERROR(cudaGetLastError());

//PAUSE_TIMER("gBeamSizeInit");

unsigned blocks = std::min(MAX_BLOCKS, numHypos);

@@ -1428,6 +1444,8 @@ void LogSoftmaxAndNBest(mblas::Vector<NthOutBatch> &nBest,
forbidUNK,
hypo2BeamSize,
hypo2Candidate);
HANDLE_ERROR(cudaGetLastError());

//PAUSE_TIMER("gLogSoftMax");

//HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream()));

@@ -1455,6 +1473,8 @@ void LogSoftmaxAndNBest(mblas::Vector<NthOutBatch> &nBest,
activeBatch2Hypo,
hypo2Candidate,
hypo2NextHypo);
HANDLE_ERROR(cudaGetLastError());

//PAUSE_TIMER("gNBestPerBatch");
PAUSE_TIMER("LogSoftmaxAndNBest");
}

@@ -1487,6 +1507,7 @@ void UpdateSentenceLengths(const mblas::Vector<unsigned> &d_newBatchIds,
unsigned threads = std::min(MAX_THREADS, d_newSentenceLengths.size());

gUpdateSentenceLengths<<<blocks, threads, 0, CudaStreamHandler::GetStream()>>>(d_newSentenceLengths, d_newBatchIds, sentenceLengths);
HANDLE_ERROR(cudaGetLastError());
}

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -1541,6 +1562,7 @@ void AddNewSourceContext(mblas::Matrix &matrix,
unsigned blocks = size / threads + ((size % threads == 0) ? 0 : 1);

gAddNewData3<<<blocks, threads, 0, CudaStreamHandler::GetStream()>>>(matrix, newMatrix, batchId, newSentenceOffset, shape);
HANDLE_ERROR(cudaGetLastError());
}

//PAUSE_TIMER("AddNewSourceContext");

@@ -1580,6 +1602,7 @@ void AddNewSCU(mblas::Matrix &matrix,
unsigned blocks = size / threads + ((size % threads == 0) ? 0 : 1);

gAddNewData3<<<blocks, threads, 0, CudaStreamHandler::GetStream()>>>(matrix, newMatrix, batchId, newSentenceOffset, shape);
HANDLE_ERROR(cudaGetLastError());
}

//PAUSE_TIMER("AddNewSCU");

@@ -1615,6 +1638,7 @@ void AddNewStates(mblas::Matrix& matrix,
unsigned blocks = size / threads + ((size % threads == 0) ? 0 : 1);

gAddNewData0<<<blocks, threads, 0, CudaStreamHandler::GetStream()>>>(matrix, newMatrix, hypoId, newSentenceOffset);
HANDLE_ERROR(cudaGetLastError());

}

@@ -197,8 +197,7 @@ Matrix& Broadcast(Functor functor,
*/
gBroadcast<<<blocks, threads, 0, CudaStreamHandler::GetStream()>>>
(functor, out, in1, in2, hypo2Batch);

HANDLE_ERROR(cudaDeviceSynchronize());
HANDLE_ERROR(cudaGetLastError());

PAUSE_TIMER("Broadcast");
return out;

@@ -244,6 +243,7 @@ Matrix& BroadcastVecColumn(Functor functor, Matrix& Out, const mblas::Vector<flo
gBroadcastVecColumn<<<blocks, threads, rows * sizeof(float), CudaStreamHandler::GetStream()>>>
(functor, outWrap, inWrap);
HANDLE_ERROR(cudaGetLastError());

return Out;
}

@@ -288,6 +288,7 @@ Matrix& BroadcastVec(Functor functor, Matrix& Out, const Matrix& In)
gBroadcastVec<<<blocks, threads, 0, stream>>>
(functor, outWrap, inWrap);
HANDLE_ERROR(cudaGetLastError());

return Out;
}

@@ -315,6 +316,7 @@ Matrix& Element(Functor functor,
gElement<<<blocks, threads, 0, stream>>>
(functor, outWrap);
HANDLE_ERROR(cudaGetLastError());

return Out;
}

@@ -346,6 +348,7 @@ Matrix& Element(Functor functor,
gElement<<<blocks, threads, 0, stream>>>
(functor, outWrap, inWrap);
HANDLE_ERROR(cudaGetLastError());

return Out;
}

@@ -389,6 +392,7 @@ Matrix& Element(Functor functor,
gElement<<<blocks, threads, 0, stream>>>
(functor, outWrap, in1Wrap, in2Wrap);
HANDLE_ERROR(cudaGetLastError());

//HANDLE_ERROR( cudaPeekAtLastError() );
//HANDLE_ERROR( cudaDeviceSynchronize() );

@@ -464,6 +468,7 @@ void CopyMatrix3(TMatrix<T> &out,
const MatrixWrapper<T> inWrap(in);

gCopyMatrix3<<<blocks, threads, 0, CudaStreamHandler::GetStream()>>>(outWrap, inWrap, smallestShape, d_oldBatchIds);
HANDLE_ERROR(cudaGetLastError());
}

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -119,6 +119,7 @@ void NthElement::getNBestList(mblas::Matrix &probs,
gMaxElement<<<numBlocks, BLOCK_SIZE, BLOCK_SIZE * sizeof(float), mblas::CudaStreamHandler::GetStream()>>>
(outWrap, probsWrap, batchPositionWrap, numBatches);
HANDLE_ERROR(cudaGetLastError());

gMaxElementUpdate<<<numBatches, BLOCK_SIZE, BLOCK_SIZE * sizeof(float), mblas::CudaStreamHandler::GetStream()>>>
(outWrap,

@@ -127,6 +128,7 @@ void NthElement::getNBestList(mblas::Matrix &probs,
batchPositionWrap,
cumBeamSizesWrap,
numBlocks);
HANDLE_ERROR(cudaGetLastError());

/*
cerr << "numBlocks=" << numBlocks << endl;

@@ -38,6 +38,8 @@ T Sum(const T *data, unsigned count)
HANDLE_ERROR( cudaStreamSynchronize(stream));
gSum<<<1, 1, 0, stream>>>(data, count, *d_ret);
HANDLE_ERROR(cudaGetLastError());

HANDLE_ERROR( cudaMemcpyAsync(&ret, d_ret, sizeof(T), cudaMemcpyDeviceToHost, stream) );

HANDLE_ERROR( cudaStreamSynchronize(stream));