diff --git a/CMakeLists.txt b/CMakeLists.txt index 1effd778..6009bcc4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,8 +2,8 @@ cmake_minimum_required(VERSION 3.5.1) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) project(amunn CXX) -SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O0 -funroll-loops -Wno-unused-result -Wno-deprecated") -LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O0; -arch=sm_35; -lineinfo; --use_fast_math;) +SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -funroll-loops -Wno-unused-result -Wno-deprecated") +LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O3; -arch=sm_35; -lineinfo; --use_fast_math;) add_definitions(-DCUDA_API_PER_THREAD_DEFAULT_STREAM) SET(CUDA_PROPAGATE_HOST_FLAGS OFF) diff --git a/src/decoder/search.h b/src/decoder/search.h index be4b2ad8..e05a284c 100644 --- a/src/decoder/search.h +++ b/src/decoder/search.h @@ -12,6 +12,12 @@ class Search { std::vector scorers_; using Matrix = typename Backend::Payload; + + template + using DeviceVector = typename Backend::DeviceVector; + + template + using HostVector = typename Backend::HostVector; public: Search(size_t threadId) @@ -90,30 +96,31 @@ class Search { Matrix& probs = probsEnsemble[0]; - Matrix costs(probs.Rows(), 1); + Matrix costs; + (*costs).Resize((*probs).Rows(), 1); HostVector vCosts; for(auto& h : prevHyps) vCosts.push_back(h->GetCost()); Backend::copy(vCosts.begin(), vCosts.end(), costs.begin()); - Backend::BroadcastVecColumn(weights[0] * Backend::_1 + Backend::_2, - probs, costs); + Backend::Broadcast(weights[0] * Backend::_1 + Backend::_2, + probs, costs); for(size_t i = 0; i < probsEnsemble.size(); ++i) Backend::Element(Backend::_1 + weights[i] * Backend::_2, probs, probsEnsemble[i]); - Backend::HostVector bestKeys(beamSize); - Backend::HostVector bestCosts(beamSize); + HostVector bestKeys(beamSize); + HostVector bestCosts(beamSize); Backend::PartialSortByKey(probs, bestKeys, bestCosts); - std::vector> breakDowns; + std::vector> breakDowns; bool doBreakdown = God::Get("n-best"); if(doBreakdown) { breakDowns.push_back(bestCosts); for(size_t i = 1; i < probsEnsemble.size(); ++i) { HostVector modelCosts(beamSize); - auto it = Backend::make_permutation_iterator(probsEnsemble[i].begin(), keys.begin()); + auto it = Backend::make_permutation_iterator(probsEnsemble[i].begin(), bestKeys.begin()); Backend::copy(it, it + beamSize, modelCosts.begin()); breakDowns.push_back(modelCosts); } @@ -136,7 +143,7 @@ class Search { float cost = 0; if(j < probsEnsemble.size()) { if(prevHyps[hypIndex]->GetCostBreakdown().size() < probsEnsemble.size()) - const_cast(prevHyps[hypIndex])->GetCostBreakdown().resize(ProbsEnsemble.size(), 0.0); + const_cast(prevHyps[hypIndex])->GetCostBreakdown().resize(probsEnsemble.size(), 0.0); cost = breakDowns[j][i] + const_cast(prevHyps[hypIndex])->GetCostBreakdown()[j]; } sum += weights[j] * cost;