add legacy code on gpu

Marcin Junczys-Dowmunt 2021-06-07 11:25:40 -07:00
parent 1d96d7b6eb
commit ce34df4d98


@@ -480,46 +480,25 @@ void ProdBatchedTypedLegacy(marian::Tensor C,
   CUDA_CHECK(cudaSetDevice((int)C->getDeviceId().no));
   ComputeType alpha = scalar;
 
-  auto aShape = A->shape();
-  auto bShape = B->shape();
-
-  // make sure both shapes have the same number of dimensions via broadcasting
-  size_t maxLength = std::max(aShape.size(), bShape.size());
-  if(aShape.size() != bShape.size()) {
-    Shape ones(std::vector<int>(maxLength, 1));
-    aShape = Shape::broadcast({aShape, ones});
-    bShape = Shape::broadcast({bShape, ones});
-  }
-
-  // Create meta-shapes without last 2 dimensions
-  Shape aShapeMeta, bShapeMeta, cShapeMeta;
-  aShapeMeta.resize(maxLength - 2);
-  bShapeMeta.resize(maxLength - 2);
-  for(size_t i = 0; i < maxLength - 2; ++i) {
-    aShapeMeta.set(i, aShape[i]);
-    bShapeMeta.set(i, bShape[i]);
-  }
-  cShapeMeta = Shape::broadcast({aShapeMeta, bShapeMeta});
-
-  size_t m = aShape[-2];
-  size_t k = aShape[-1];
+  // determine meta-shape of bdot operation. Essentially treat the last two dimensions as single elements
+  // such that (..., m, k) x (..., k, n) -> (..., m, n) where ... is a broadcastable shape as in element-wise kernels.
+  int batchA = A->shape().elements() / (A->shape()[-1] * A->shape()[-2]);
+  int batchB = B->shape().elements() / (B->shape()[-1] * B->shape()[-2]);
+
+  int m = A->shape()[-2];
+  int k = A->shape()[-1];
   if(transA)
     std::swap(m, k);
 
-  size_t l = bShape[-2];
-  size_t n = bShape[-1];
+  int l = B->shape()[-2];
+  int n = B->shape()[-1];
   if(transB)
     std::swap(l, n);
 
-  size_t lda = aShape[-1];
-  size_t ldb = bShape[-1];
-  size_t ldc = bShape[-1];
+  int lda = A->shape()[-1];
+  int ldb = B->shape()[-1];
+  int ldc = B->shape()[-1];
 
   if(transB)
-    ldc = bShape[-2];
+    ldc = B->shape()[-2];
 
   cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
   cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
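Below, a minimal standalone sketch of the dimension logic this hunk introduces: fold all leading dimensions into a flat batch count, then derive the GEMM sizes and leading dimensions for row-major storage. The shapes, the elements() lambda, and the printed values are illustrative assumptions, not Marian API:

// Standalone sketch of the legacy dimension logic above, assuming row-major
// storage as in Marian tensors. Shapes and helpers are hypothetical.
#include <algorithm>
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int main() {
  // A: (batch=8, m=4, k=6), B: (batch=1, k=6, n=5) -- B is broadcast over the batch.
  std::vector<int> aShape = {8, 4, 6};
  std::vector<int> bShape = {1, 6, 5};
  bool transA = false, transB = false;

  auto elements = [](const std::vector<int>& s) {
    return std::accumulate(s.begin(), s.end(), 1, std::multiplies<int>());
  };

  // Fold everything except the last two dimensions into a single batch count.
  int batchA = elements(aShape) / (aShape.end()[-1] * aShape.end()[-2]); // 8
  int batchB = elements(bShape) / (bShape.end()[-1] * bShape.end()[-2]); // 1

  int m = aShape.end()[-2], k = aShape.end()[-1];
  if(transA) std::swap(m, k);
  int l = bShape.end()[-2], n = bShape.end()[-1];
  if(transB) std::swap(l, n);

  // Inner dimensions must agree for the product to be defined.
  if(k != l) { printf("inner dimension mismatch: k=%d l=%d\n", k, l); return 1; }

  // Leading dimensions: the stride between consecutive rows in row-major layout.
  int lda = aShape.end()[-1];
  int ldb = bShape.end()[-1];
  int ldc = transB ? bShape.end()[-2] : bShape.end()[-1];

  printf("batchA=%d batchB=%d (m,k,n)=(%d,%d,%d) lda=%d ldb=%d ldc=%d\n",
         batchA, batchB, m, k, n, lda, ldb, ldc);
  // Expected: batchA=8 batchB=1 (m,k,n)=(4,6,5) lda=6 ldb=5 ldc=5
}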
@@ -528,29 +507,18 @@ void ProdBatchedTypedLegacy(marian::Tensor C,
   auto cublasHandle = backend->getCublasHandle();
   auto compute = backend->getCudaComputeCapability();
 
-  auto strideA = m * k;
-  auto strideB = n * k;
+  auto strideA = batchA == 1 ? 0 : m * k;
+  auto strideB = batchB == 1 ? 0 : n * k;
   auto strideC = n * m;
-  auto batchC = cShapeMeta.elements();
-
-  // Convert to functional shapes to be able to map dimensions. @TODO merge this
-  functional::Shape aShapeMetaF = aShapeMeta;
-  functional::Shape bShapeMetaF = bShapeMeta;
-  functional::Shape cShapeMetaF = cShapeMeta;
+  auto batchC = std::max(batchA, batchB);
 
   std::vector<const ElementType*> aptr;
   std::vector<const ElementType*> bptr;
   std::vector<ElementType*> cptr;
-  functional::Array<int, functional::Shape::size()> dims;
   for(int i = 0; i < batchC; i++) {
-    cShapeMetaF.dims(i, dims);
-    auto aIndex = aShapeMetaF.bindex(dims);
-    auto bIndex = bShapeMetaF.bindex(dims);
-    aptr.push_back(A->data<ElementType>() + aIndex * strideA);
-    bptr.push_back(B->data<ElementType>() + bIndex * strideB);
+    aptr.push_back(A->data<ElementType>() + (i % batchA) * strideA);
+    bptr.push_back(B->data<ElementType>() + (i % batchB) * strideB);
     cptr.push_back(C->data<ElementType>() + i * strideC);
   }
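Below, a similar standalone sketch of the modulo-broadcast pointer setup in this hunk. With batchB == 1 the stride for B collapses to 0, so every product reuses the same B matrix while A and C advance per batch; code outside the visible hunk would then presumably copy these host arrays to the device for a batched-pointer cuBLAS GEMM such as cublasSgemmBatched. Sizes and names are illustrative:

// Standalone sketch of the modulo-broadcast pointer setup above.
// All sizes are hypothetical, chosen small for illustration.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  const int m = 2, k = 3, n = 2;
  const int batchA = 4, batchB = 1;            // B is shared across all 4 products
  const int batchC = std::max(batchA, batchB);

  std::vector<float> A(batchA * m * k), B(batchB * k * n), C(batchC * m * n);

  // A stride of 0 pins the broadcast operand to its single matrix.
  const int strideA = batchA == 1 ? 0 : m * k;
  const int strideB = batchB == 1 ? 0 : n * k;
  const int strideC = n * m;

  // Host-side pointer arrays, one entry per output batch element.
  std::vector<const float*> aptr, bptr;
  std::vector<float*> cptr;
  for(int i = 0; i < batchC; ++i) {
    aptr.push_back(A.data() + (i % batchA) * strideA);
    bptr.push_back(B.data() + (i % batchB) * strideB); // always B.data(): strideB == 0
    cptr.push_back(C.data() + i * strideC);
  }

  for(int i = 0; i < batchC; ++i)
    printf("batch %d: A+%td B+%td C+%td\n",
           i, aptr[i] - A.data(), bptr[i] - B.data(), cptr[i] - C.data());
  // Expected offsets: A advances by 6 per batch, B stays at 0, C advances by 4.
}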