### AclnnDualLevelQuantMatmulWeightNz Example Setup Source: https://github.com/cann/ops-nn/blob/master/matmul/dual_level_quant_batch_matmul/docs/aclnnDualLevelQuantMatmulWeightNz.md Sets up input and output tensor shapes for the AclnnDualLevelQuantMatmulWeightNz operation. Requires specific shape configurations based on quantization parameters. ```cpp #include "aclnnop/aclnn_dual_level_quant_matmul_nz.h" #include "aclnnop/aclnn_npu_format_cast.h" #include "aclnnop/aclnn_cast.h" int AclnnDualLevelQuantMatmulWeightNz(int32_t deviceId, aclrtStream stream) { int ret = 0; // 2. 构造输入与输出，需要根据API的接口自定义构造 constexpr int64_t B4_IN_B8_NUMS = 2L; constexpr int64_t B8_IN_B16_NUMS = 2L; int64_t m = 256; int64_t k = 1024; int64_t n = 512; int64_t level0GroupSize = 512; int64_t level1GroupSize = 32; bool transposeX1 = false; bool transposeX2 = true; std::vector x1Shape = {m, k}; std::vector x2Shape = {n, k}; std::vector biasShape = {n}; std::vector x1Level0ScaleShape = {m, k / level0GroupSize}; std::vector x1Level1ScaleShape = {m, k / level1GroupSize / B8_IN_B16_NUMS, B8_IN_B16_NUMS}; std::vector x2Level0ScaleShape = {k / level0GroupSize, n}; std::vector x2Level1ScaleShape = {n, k / level1GroupSize / B8_IN_B16_NUMS, B8_IN_B16_NUMS}; std::vector outShape = {m, n}; void* x1DeviceAddr = nullptr; void* x2DeviceAddr = nullptr; void* x2NzDeviceAddr = nullptr; void* biasDeviceAddr = nullptr; void* x1Level0ScaleDeviceAddr = nullptr; void* x1Level1ScaleDeviceAddr = nullptr; void* x2Level0ScaleDeviceAddr = nullptr; void* x2Level1ScaleDeviceAddr = nullptr; void* outDeviceAddr = nullptr; void* outFp32DeviceAddr = nullptr; ``` -------------------------------- ### Main Function: CANN Operator Execution Example Source: https://github.com/cann/ops-nn/blob/master/foreach/foreach_minimum_scalar/docs/aclnnForeachMinimumScalar.md Demonstrates the main workflow: initializing the device, constructing input/output tensors, and calling a CANN operator (aclnnForeachMinimumScalar) by first getting the 
workspace size and then allocating memory for it. ```C++ int main() { // 1. （固定写法）device/stream初始化，参考acl API手册 // 根据自己的实际device填写deviceId int32_t deviceId = 0; aclrtStream stream; auto ret = Init(deviceId, &stream); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("Init acl failed. ERROR: %d\n", ret); return ret); // 2. 构造输入与输出，需要根据API的接口自定义构造 std::vector selfShape1 = {2, 3}; std::vector selfShape2 = {1, 3}; std::vector outShape1 = {2, 3}; std::vector outShape2 = {1, 3}; std::vector alphaShape = {1}; void* input1DeviceAddr = nullptr; void* input2DeviceAddr = nullptr; void* out1DeviceAddr = nullptr; void* out2DeviceAddr = nullptr; void* alphaDeviceAddr = nullptr; aclTensor* input1 = nullptr; aclTensor* input2 = nullptr; aclTensor* alpha = nullptr; aclTensor* out1 = nullptr; aclTensor* out2 = nullptr; std::vector input1HostData = {1, 2, 3, 4, 5, 6}; std::vector input2HostData = {7, 8, 9}; std::vector out1HostData(6, 0); std::vector out2HostData(3, 0); std::vector alphaValueHostData = {1.2f}; float alphaValue = 5.2f; // 创建input1 aclTensor ret = CreateAclTensor(input1HostData, selfShape1, &input1DeviceAddr, aclDataType::ACL_FLOAT, &input1); CHECK_RET(ret == ACL_SUCCESS, return ret); // 创建input2 aclTensor ret = CreateAclTensor(input2HostData, selfShape2, &input2DeviceAddr, aclDataType::ACL_FLOAT, &input2); CHECK_RET(ret == ACL_SUCCESS, return ret); // 创建alpha aclTensor ret = CreateAclTensor(alphaValueHostData, alphaShape, &alphaDeviceAddr, aclDataType::ACL_FLOAT, &alpha); CHECK_RET(ret == ACL_SUCCESS, return ret); // 创建out1 aclTensor ret = CreateAclTensor(out1HostData, outShape1, &out1DeviceAddr, aclDataType::ACL_FLOAT, &out1); CHECK_RET(ret == ACL_SUCCESS, return ret); // 创建out2 aclTensor ret = CreateAclTensor(out2HostData, outShape2, &out2DeviceAddr, aclDataType::ACL_FLOAT, &out2); CHECK_RET(ret == ACL_SUCCESS, return ret); std::vector tempInput{input1, input2}; aclTensorList* tensorListInput = aclCreateTensorList(tempInput.data(), tempInput.size()); std::vector 
tempOutput{out1, out2}; aclTensorList* tensorListOutput = aclCreateTensorList(tempOutput.data(), tempOutput.size()); // 3. 调用CANN算子库API，需要修改为具体的API名称 uint64_t workspaceSize = 0; aclOpExecutor* executor; // 调用aclnnForeachMinimumScalar第一段接口 ret = aclnnForeachMinimumScalarGetWorkspaceSize(tensorListInput, alpha, tensorListOutput, &workspaceSize, &executor); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnForeachMinimumScalarGetWorkspaceSize failed. ERROR: %d\n", ret); return ret); // 根据第一段接口计算出的workspaceSize申请device内存 void* workspaceAddr = nullptr; if (workspaceSize > 0) { ``` -------------------------------- ### C++ Example: Calling aclnnTransQuantParamV2 API Source: https://github.com/cann/ops-nn/blob/master/quant/trans_quant_param_v2/docs/aclnnTransQuantParamV2.md This snippet demonstrates the complete workflow for calling the aclnnTransQuantParamV2 API. It includes getting workspace size, allocating device memory, executing the API, synchronizing the stream, and copying results back to the host. ```cpp // 3. 调用CANN算子库API，需要修改为具体的API名称 uint64_t workspaceSize = 0; aclOpExecutor* executor = nullptr; // 调用aclnnTransQuantParamV2第一段接口 ret = aclnnTransQuantParamV2GetWorkspaceSize(scale, offset, out, &workspaceSize, &executor); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnTransQuantParamV2GetWorkspaceSize failed. ERROR: %d\n", ret); return ret); // 根据第一段接口计算出的workspaceSize申请device内存 void* workspaceAddr = nullptr; std::unique_ptr workspaceAddrPtr(nullptr, aclrtFree); if (workspaceSize > 0) { ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); return ret); workspaceAddrPtr.reset(workspaceAddr); } // 调用aclnnTransQuantParamV2第二段接口 ret = aclnnTransQuantParamV2(workspaceAddr, workspaceSize, executor, stream); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnTransQuantParamV2 failed. ERROR: %d\n", ret); return ret); // 4. 
（固定写法）同步等待任务执行结束 ret = aclrtSynchronizeStream(stream); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret); return ret); // 5. 获取输出的值，将device侧内存上的结果拷贝至host侧，需要根据具体API的接口定义修改 auto size = GetShapeSize(outShape); std::vector resultData(size, 0); ret = aclrtMemcpy( resultData.data(), resultData.size() * sizeof(resultData[0]), outDeviceAddr, size * sizeof(resultData[0]), ACL_MEMCPY_DEVICE_TO_HOST); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("copy result from device to host failed. ERROR: %d\n", ret); return ret); for (int64_t i = 0; i < size; i++) { LOG_PRINT("result[%ld] is: %lu\n", i, resultData[i]); } return ACL_SUCCESS; } int main() { // 1. （固定写法）device/stream初始化，参考acl API手册 // 根据自己的实际device填写deviceId int32_t deviceId = 0; aclrtStream stream; auto ret = aclnnTransQuantParamV2Test(deviceId, stream); CHECK_FREE_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnTransQuantParamV2Test failed. ERROR: %d\n", ret); return ret); Finalize(deviceId, stream); return 0; } ``` -------------------------------- ### C++ Example: Calling aclnnMedianDim API Source: https://github.com/cann/ops-nn/blob/master/index/gather_v2/docs/aclnnMedianDim.md Demonstrates the complete workflow for calling the aclnnMedianDim API, from getting workspace size to freeing resources. Ensure correct API names and data types are used based on your specific needs. ```cpp // 3. 调用CANN算子库API，需要修改为具体的API uint64_t workspaceSize = 0; aclOpExecutor* executor; // 调用aclnnMedianDim第一段接口 ret = aclnnMedianDimGetWorkspaceSize(self, dim, keepDim, valuesOut, indicesOut, &workspaceSize, &executor); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnMedianDimGetWorkspaceSize failed. ERROR: %d\n", ret); return ret); // 根据第一段接口计算出的workspaceSize申请device内存 void* workspaceAddr = nullptr; if (workspaceSize > 0) { ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("allocate workspace failed. 
ERROR: %d\n", ret); return ret;); } // 调用aclnnMedianDim第二段接口 ret = aclnnMedianDim(workspaceAddr, workspaceSize, executor, stream); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnMedianDim failed. ERROR: %d\n", ret); return ret); // 4.（固定写法）同步等待任务执行结束 ret = aclrtSynchronizeStream(stream); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret); return ret); // 5. 获取输出的值，将device侧内存上的结果拷贝至host侧，需要根据具体API的接口定义修改 auto size = GetShapeSize(valuesOutShape); std::vector valuesOutData(size, 0); ret = aclrtMemcpy(valuesOutData.data(), valuesOutData.size() * sizeof(valuesOutData[0]), valuesOutDeviceAddr, size * sizeof(float), ACL_MEMCPY_DEVICE_TO_HOST); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("copy valuesOut from device to host failed. ERROR: %d\n", ret); return ret); for (int64_t i = 0; i < size; i++) { LOG_PRINT("result[%ld] is: %f\n", i, valuesOutData[i]); } std::vector indicesOutData(size, 0); ret = aclrtMemcpy(indicesOutData.data(), indicesOutData.size() * sizeof(indicesOutData[0]), indicesOutDeviceAddr, size * sizeof(float), ACL_MEMCPY_DEVICE_TO_HOST); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("copy indicesOut from device to host failed. ERROR: %d\n", ret); return ret); for (int64_t i = 0; i < size; i++) { LOG_PRINT("result[%ld] is: %f\n", i, indicesOutData[i]); } // 6. 释放aclTensor和aclScalar，需要根据具体API的接口定义修改 aclDestroyTensor(self); aclDestroyTensor(valuesOut); aclDestroyTensor(indicesOut); // 7. 释放device资源，需要根据具体API的接口定义修改 aclrtFree(selfDeviceAddr); aclrtFree(valuesOutDeviceAddr); aclrtFree(indicesOutDeviceAddr); if (workspaceSize > 0) { aclrtFree(workspaceAddr); } aclrtDestroyStream(stream); aclrtResetDevice(deviceId); aclFinalize(); return 0; } ``` -------------------------------- ### C++ Example: Using aclnnInplacePut API Source: https://github.com/cann/ops-nn/blob/master/index/scatter_nd_update/docs/aclnnInplacePut.md This C++ code demonstrates the complete workflow for calling the aclnnInplacePut API. 
It includes steps for getting workspace size, allocating memory, executing the inplace operation, synchronizing the stream, copying results back to the host, and cleaning up all resources. ```cpp // 3. 调用CANN算子库API，需要修改为具体的Api名称 uint64_t workspaceSize = 0; aclOpExecutor* executor; // 调用aclnnInplacePut第一段接口 ret = aclnnInplacePutGetWorkspaceSize(self, index, source,false, &workspaceSize, &executor); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnInplacePutGetWorkspaceSize failed. ERROR: %d\n", ret); return ret); // 根据第一段接口计算出的workspaceSize申请device内存 void* workspaceAddr = nullptr; if (workspaceSize > 0) { ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); return ret); } // 调用aclnnInplacePut第二段接口 ret = aclnnInplacePut(workspaceAddr, workspaceSize, executor, stream); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnInplacePut failed. ERROR: %d\n", ret); return ret); // 4.（固定写法）同步等待任务执行结束 ret = aclrtSynchronizeStream(stream); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret); return ret); // 5. 获取输出的值，将device侧内存上的结果拷贝至host侧，需要根据具体API的接口定义修改 auto size = GetShapeSize(selfShape); std::vector resultData(size, 0); ret = aclrtMemcpy(resultData.data(), resultData.size() * sizeof(resultData[0]), selfDeviceAddr, size * sizeof(float), ACL_MEMCPY_DEVICE_TO_HOST); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("copy result from device to host failed. ERROR: %d\n", ret); return ret); for (int64_t i = 0; i < size; i++) { LOG_PRINT("result[%ld] is: %f\n", i, resultData[i]); } // 6. 释放aclTensor和aclScalar，需要根据具体API的接口定义修改 aclDestroyTensor(self); aclDestroyTensor(index); aclDestroyTensor(source); // 7. 
释放device资源，需要根据具体API的接口定义修改 aclrtFree(selfDeviceAddr); aclrtFree(indexDeviceAddr); aclrtFree(sourceDeviceAddr); if (workspaceSize > 0) { aclrtFree(workspaceAddr); } aclrtDestroyStream(stream); aclrtResetDevice(deviceId); aclFinalize(); return 0; } ``` -------------------------------- ### C++ Example for aclnnGatherV2 API Source: https://github.com/cann/ops-nn/blob/master/index/gather_v2/docs/aclnnGatherV2.md This C++ code demonstrates the complete workflow for calling the aclnnGatherV2 API. It covers getting workspace size, allocating device memory, executing the operator, synchronizing the stream, copying results to the host, and releasing all allocated resources. ```cpp // 3. 调用CANN算子库API，需要修改为具体的Api名称 uint64_t workspaceSize = 0; aclOpExecutor* executor; // 调用aclnnGatherV2第一段接口 ret = aclnnGatherV2GetWorkspaceSize(self, dim, index, out, &workspaceSize, &executor); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnGatherV2GetWorkspaceSize failed. ERROR: %d\n", ret); return ret); // 根据第一段接口计算出的workspaceSize申请device内存 void* workspaceAddr = nullptr; if (workspaceSize > 0) { ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); return ret); } // 调用aclnnGatherV2第二段接口 ret = aclnnGatherV2(workspaceAddr, workspaceSize, executor, stream); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnGatherV2 failed. ERROR: %d\n", ret); return ret); // 4.（固定写法）同步等待任务执行结束 ret = aclrtSynchronizeStream(stream); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret); return ret); // 5. 获取输出的值，将device侧内存上的结果拷贝至host侧，需要根据具体API的接口定义修改 auto size = GetShapeSize(outShape); std::vector resultData(size, 0); ret = aclrtMemcpy(resultData.data(), resultData.size() * sizeof(resultData[0]), outDeviceAddr, size * sizeof(resultData[0]), ACL_MEMCPY_DEVICE_TO_HOST); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("copy result from device to host failed. 
ERROR: %d\n", ret); return ret); for (int64_t i = 0; i < size; i++) { LOG_PRINT("result[%ld] is: %f\n", i, resultData[i]); } // 6. 释放aclTensor和aclScalar，需要根据具体API的接口定义修改 aclDestroyTensor(self); aclDestroyTensor(index); aclDestroyTensor(out); // 7. 释放device资源，需要根据具体API的接口定义修改 aclrtFree(selfDeviceAddr); aclrtFree(indexDeviceAddr); aclrtFree(outDeviceAddr); if (workspaceSize > 0) { aclrtFree(workspaceAddr); } aclrtDestroyStream(stream); aclrtResetDevice(deviceId); aclFinalize(); return 0; } ``` -------------------------------- ### Aclnn API Invocation Example (C++) Source: https://github.com/cann/ops-nn/blob/master/docs/zh/invocation/quick_op_invocation.md This C++ code demonstrates how to invoke a custom operator using the aclnn API. It includes steps for initializing the device and stream, constructing input and output tensors, getting workspace size, calling the operator, synchronizing the stream, and retrieving output results. Ensure you have the CANN-toolkit and compiled operator package installed. ```C++ int aclnnAddExampleTest(int32_t deviceId, aclrtStream& stream) { // 1. 调用acl进行device/stream初始化 auto ret = Init(deviceId, &stream); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("Init acl failed. ERROR: %d\n", ret); return ret); // 2. 
构造输入与输出，需要根据API的接口自定义构造 aclTensor* selfX = nullptr; void* selfXDeviceAddr = nullptr; // 当前样例算子未进行shape、dtype全泛化，其他输入场景可能存在不支持情况 std::vector<int64_t> selfXShape = {32, 4, 4, 4}; std::vector<float> selfXHostData(2048, 1); ret = CreateAclTensor(selfXHostData, selfXShape, &selfXDeviceAddr, aclDataType::ACL_FLOAT, &selfX); // 通过智能指针自动释放aclTensor和device资源 std::unique_ptr<aclTensor, aclnnStatus (*)(const aclTensor*)> selfXPtr(selfX, aclDestroyTensor); std::unique_ptr<void, aclError (*)(void*)> selfXDeviceAddrPtr(selfXDeviceAddr, aclrtFree); CHECK_RET(ret == ACL_SUCCESS, return ret); aclTensor* selfY = nullptr; void* selfYDeviceAddr = nullptr; std::vector<int64_t> selfYShape = {32, 4, 4, 4}; std::vector<float> selfYHostData(2048, 1); ret = CreateAclTensor(selfYHostData, selfYShape, &selfYDeviceAddr, aclDataType::ACL_FLOAT, &selfY); std::unique_ptr<aclTensor, aclnnStatus (*)(const aclTensor*)> selfYPtr(selfY, aclDestroyTensor); std::unique_ptr<void, aclError (*)(void*)> selfYDeviceAddrPtr(selfYDeviceAddr, aclrtFree); CHECK_RET(ret == ACL_SUCCESS, return ret); aclTensor* out = nullptr; void* outDeviceAddr = nullptr; std::vector<int64_t> outShape = {32, 4, 4, 4}; std::vector<float> outHostData(2048, 1); ret = CreateAclTensor(outHostData, outShape, &outDeviceAddr, aclDataType::ACL_FLOAT, &out); std::unique_ptr<aclTensor, aclnnStatus (*)(const aclTensor*)> outPtr(out, aclDestroyTensor); std::unique_ptr<void, aclError (*)(void*)> outDeviceAddrPtr(outDeviceAddr, aclrtFree); CHECK_RET(ret == ACL_SUCCESS, return ret); // 3. 调用CANN算子库API，需要修改为具体的Api名称 uint64_t workspaceSize = 0; aclOpExecutor* executor; // 4. 调用aclnnAddExample第一段接口 ret = aclnnAddExampleGetWorkspaceSize(selfX, selfY, out, &workspaceSize, &executor); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnAddExampleGetWorkspaceSize failed. ERROR: %d\n", ret); return ret); // 根据第一段接口计算出的workspaceSize申请device内存 void* workspaceAddr = nullptr; std::unique_ptr<void, aclError (*)(void*)> workspaceAddrPtr(nullptr, aclrtFree); if (workspaceSize > static_cast<uint64_t>(0)) { ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); return ret); workspaceAddrPtr.reset(workspaceAddr); } // 5.
调用aclnnAddExample第二段接口 ret = aclnnAddExample(workspaceAddr, workspaceSize, executor, stream); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnAddExample failed. ERROR: %d\n", ret); return ret); // 6.（固定写法）同步等待任务执行结束 ret = aclrtSynchronizeStream(stream); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret); return ret); // 7. 获取输出的值，将device侧内存上的结果拷贝至host侧，需要根据具体API的接口定义修改 PrintOutResult(outShape, &outDeviceAddr, selfXHostData, selfYHostData); return ACL_SUCCESS; } int main() { int32_t deviceId = 0; aclrtStream stream; auto ret = aclnnAddExampleTest(deviceId, stream); // 释放device资源以及acl去初始化 aclrtDestroyStream(stream); aclrtResetDevice(deviceId); aclFinalize(); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnAddExampleTest failed. ERROR: %d\n", ret); return ret); return 0; } ``` -------------------------------- ### Run Example Compilation and Execution Source: https://github.com/cann/ops-nn/blob/master/docs/zh/install/build.md Compile and execute the specified operator and mode examples. Use '--run_example --help' to view usage instructions. ```bash bash build.sh --run_example ``` -------------------------------- ### Tiling UT Basic Workflow Example Source: https://github.com/cann/ops-nn/blob/master/docs/zh/develop/aicore_develop_guide.md A simplified example demonstrating the basic workflow of a Tiling Unit Test. It covers constructing the context, setting expected results, and executing the test case. ```CPP TEST_F(${OpName}TilingTest, test_case_xxx) { // 声明结构体并初始化一个结构体变量 struct ${OpName}CompileInfo { } compileInfo; // 1. 
构造用例上下文 gert::TilingContextPara tilingContextPara( "${OpName}", { {{{32, 4, 4, 4}, {32, 4, 4, 4}}, ge::DT_FLOAT, ge::FORMAT_ND}, // input tensor1 {{{32, 4, 4, 4}, {32, 4, 4, 4}}, ge::DT_FLOAT, ge::FORMAT_ND}, // input tensor2 // 若输入为ValueDepend，需额外传入true和constValue这两个参数 // 其中constValue为自己定义的变量，如int constValue[2] = {2, 2} // {{{32, 4, 4, 4}, {32, 4, 4, 4}}, ge::DT_FLOAT, ge::FORMAT_ND, true, constValue} }, { {{{32, 4, 4, 4}, {32, 4, 4, 4}}, ge::DT_FLOAT, ge::FORMAT_ND}, // output tensor }, { // 属性 gert::TilingContextPara::OpAttr("${attr_name}", AnyValue::CreateFrom("${attr_value}")) }, &compileInfo, 64, // tiling阶段获取的核数 262144, // tiling阶段获取的ub大小，但实际获取的值比指定值少256字节 4096 // 指定tiling阶段中tiling data的最大值 ); // 2. 设定预期结果 uint64_t expectTilingKey = 0; string expectTilingData = "2048 32 10912 "; std::vector expectWorkspaces = {0}; // 3. 调用接口执行用例 ExecuteTestCase(tilingContextPara, ge::GRAPH_SUCCESS, expectTilingKey, expectTilingData, expectWorkspaces); } ``` -------------------------------- ### ForeachAddcdivScalarList Operator C++ Example Source: https://github.com/cann/ops-nn/blob/master/foreach/foreach_addcdiv_scalar_list/README.md This snippet demonstrates how to call the ForeachAddcdivScalarList operator using the aclnn interface in C++. Ensure the necessary headers and setup are in place for aclnn operations.
```cpp #include <iostream> #include <vector> #include "acl/acl.h" #include "aclnn/hal_api.h" #include "aclnn/types.h" int main() { // Initialize ACL aclError ret = aclInit("acl.json"); if (ret != ACL_ERROR_NONE) { std::cerr << "ACL init failed: " << ret << std::endl; return -1; } // Create input tensors (example data) std::vector<float> x1_data = {1.0f, 2.0f, 3.0f}; std::vector<float> x2_data = {4.0f, 5.0f, 6.0f}; std::vector<float> x3_data = {2.0f, 1.0f, 3.0f}; std::vector<float> scalars_data = {0.5f, 1.0f, 2.0f}; // Create ACL tensors aclTensor *x1 = nullptr; aclTensor *x2 = nullptr; aclTensor *x3 = nullptr; aclTensor *scalars = nullptr; aclTensor *y = nullptr; // Allocate device memory and create aclTensor objects // ... (code to allocate device memory and create aclTensor from data) // Call ForeachAddcdivScalarList operator ret = aclnnForeachAddcdivScalarList(x1, x2, x3, scalars, &y, ACL_FORMAT_ND, ACL_DTYPE_FLOAT32); if (ret != ACL_ERROR_NONE) { std::cerr << "aclnnForeachAddcdivScalarList failed: " << ret << std::endl; // ... (cleanup) return -1; } // Process output tensor y // ... (code to copy data from device to host and verify) // Destroy tensors and release resources // ... (code to destroy tensors and release resources) // Destroy ACL aclFinalize(); return 0; } ``` -------------------------------- ### Main Function - Setup and API Call Source: https://github.com/cann/ops-nn/blob/master/foreach/foreach_add_scalar/docs/aclnnForeachAddScalarV2.md Sets up input/output tensors and calls the aclnnForeachAddScalarV2 operator. Requires proper initialization and tensor construction. ```Cpp int main() { // 1. （固定写法）device/stream初始化，参考acl API手册 // 根据自己的实际device填写deviceId int32_t deviceId = 0; aclrtStream stream; auto ret = Init(deviceId, &stream); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("Init acl failed. ERROR: %d\n", ret); return ret); // 2.
构造输入与输出，需要根据API的接口自定义构造 std::vector selfShape1 = {2, 3}; std::vector selfShape2 = {1, 3}; std::vector outShape1 = {2, 3}; std::vector outShape2 = {1, 3}; void* input1DeviceAddr = nullptr; void* input2DeviceAddr = nullptr; void* out1DeviceAddr = nullptr; void* out2DeviceAddr = nullptr; aclTensor* input1 = nullptr; aclTensor* input2 = nullptr; aclScalar* alpha = nullptr; aclTensor* out1 = nullptr; aclTensor* out2 = nullptr; std::vector input1HostData = {1, 2, 3, 4, 5, 6}; std::vector input2HostData = {7, 8, 9}; std::vector out1HostData(6, 0); std::vector out2HostData(3, 0); float alphaValue = 1.2f; // 创建input1 aclTensor ret = CreateAclTensor(input1HostData, selfShape1, &input1DeviceAddr, aclDataType::ACL_FLOAT, &input1); CHECK_RET(ret == ACL_SUCCESS, return ret); // 创建input2 aclTensor ret = CreateAclTensor(input2HostData, selfShape2, &input2DeviceAddr, aclDataType::ACL_FLOAT, &input2); CHECK_RET(ret == ACL_SUCCESS, return ret); // 创建alpha aclScalar alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); CHECK_RET(alpha != nullptr, return ret); // 创建out1 aclTensor ret = CreateAclTensor(out1HostData, outShape1, &out1DeviceAddr, aclDataType::ACL_FLOAT, &out1); CHECK_RET(ret == ACL_SUCCESS, return ret); // 创建out2 aclTensor ret = CreateAclTensor(out2HostData, outShape2, &out2DeviceAddr, aclDataType::ACL_FLOAT, &out2); CHECK_RET(ret == ACL_SUCCESS, return ret); std::vector tempInput{input1, input2}; aclTensorList* tensorListInput = aclCreateTensorList(tempInput.data(), tempInput.size()); std::vector tempOutput{out1, out2}; aclTensorList* tensorListOutput = aclCreateTensorList(tempOutput.data(), tempOutput.size()); // 3. 调用CANN算子库API，需要修改为具体的API名称 uint64_t workspaceSize = 0; aclOpExecutor* executor; // 调用aclnnForeachAddScalarV2第一段接口 ret = aclnnForeachAddScalarV2GetWorkspaceSize(tensorListInput, alpha, tensorListOutput, &workspaceSize, &executor); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnForeachAddScalarV2GetWorkspaceSize failed. 
ERROR: %d\n", ret); return ret); // 根据第一段接口计算出的workspaceSize申请device内存 void* workspaceAddr = nullptr; if (workspaceSize > 0) { ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); return ret); } ``` -------------------------------- ### Execute BatchNormReduce Operator Source: https://github.com/cann/ops-nn/blob/master/norm/bn_training_reduce/docs/aclnnBatchNormReduce.md This example demonstrates the execution of the aclnnBatchNormReduce operator. It involves getting the workspace size, allocating workspace memory, and then calling the operator with the executor and stream. Synchronization is required after execution. ```C++ int main(int argc, char* argv[]) { // 1. （固定写法）device/stream初始化，参考acl API手册 // 根据自己的实际device填写deviceId int32_t deviceId = 0; aclrtStream stream; auto ret = Init(deviceId, &stream); // check根据自己的需要处理 CHECK_RET(ret == 0, LOG_PRINT("Init acl failed. ERROR: %d\n", ret); return ret); // 2. 构造输入与输出，需要根据API的接口自定义构造 void* xDeviceAddr = nullptr; void* sumDeviceAddr = nullptr; void* squareSumDeviceAddr = nullptr; aclTensor* x = nullptr; aclTensor* sum = nullptr; aclTensor* squareSum = nullptr; std::vector xShape = {1, 2, 3, 4}; std::vector sumShape = {2}; std::vector squareSumShape = {2}; std::vector xHostData(24, 1); std::vector sumHostData(2, 0); std::vector squareSumHostData(2, 0); // 创建x aclTensor ret = CreateAclTensor(xHostData, xShape, &xDeviceAddr, aclDataType::ACL_FLOAT, &x, aclFormat::ACL_FORMAT_NCHW); CHECK_RET(ret == ACL_SUCCESS, return ret); // 创建sum aclTensor ret = CreateAclTensor(sumHostData, sumShape, &sumDeviceAddr, aclDataType::ACL_FLOAT, &sum, aclFormat::ACL_FORMAT_ND); CHECK_RET(ret == ACL_SUCCESS, return ret); // 创建squareSum aclTensor ret = CreateAclTensor( squareSumHostData, squareSumShape, &squareSumDeviceAddr, aclDataType::ACL_FLOAT, &squareSum, aclFormat::ACL_FORMAT_ND); CHECK_RET(ret == ACL_SUCCESS, return ret); // 3. 
调用CANN算子库API，需要修改为具体的API uint64_t workspaceSize = 0; aclOpExecutor* executor; // 调用aclnnBatchNormReduce第一段接口 ret = aclnnBatchNormReduceGetWorkspaceSize(x, sum, squareSum, &workspaceSize, &executor); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnBatchNormReduceGetWorkspaceSize failed. ERROR: %d\n", ret); return ret); // 根据第一段接口计算出的workspaceSize申请device内存 void* workspaceAddr = nullptr; if (workspaceSize > 0) { ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); return ret;); } // 调用aclnnBatchNormReduce第二段接口 ret = aclnnBatchNormReduce(workspaceAddr, workspaceSize, executor, stream); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnBatchNormReduce failed. ERROR: %d\n", ret); return ret); // 4. （固定写法）同步等待任务执行结束 ret = aclrtSynchronizeStream(stream); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret); return ret); // 5. 获取输出的值，将device侧内存上的结果拷贝至host侧，需要根据具体API的接口定义修改 auto size = GetShapeSize(sumShape); std::vector resultData(size, 0); ret = aclrtMemcpy( ``` -------------------------------- ### Example Build Output Source: https://github.com/cann/ops-nn/blob/master/docs/zh/invocation/quick_op_invocation.md This is an example of the output you might see after successfully running the build and execution script. ```text INFO - [XIR]: Finalize ir graph session success ``` -------------------------------- ### C++ Example for aclnnForeachAddcmulScalar Source: https://github.com/cann/ops-nn/blob/master/foreach/foreach_addcmul_scalar/docs/aclnnForeachAddcmulScalar.md This C++ code demonstrates how to initialize the ACL environment, create tensors, and prepare for calling the aclnnForeachAddcmulScalar function. It includes helper functions for tensor creation and environment setup. 
```cpp #include <iostream> #include <vector> #include "acl/acl.h" #include "aclnnop/aclnn_foreach_addcmul_scalar.h" #define CHECK_RET(cond, return_expr) \ do { \ if (!(cond)) { \ return_expr; \ } \ } while (0) #define LOG_PRINT(message, ...) \ do { \ printf(message, ##__VA_ARGS__); \ } while (0) int64_t GetShapeSize(const std::vector<int64_t>& shape) { int64_t shapeSize = 1; for (auto i : shape) { shapeSize *= i; } return shapeSize; } int Init(int32_t deviceId, aclrtStream *stream) { // 固定写法，资源初始化 auto ret = aclInit(nullptr); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclInit failed. ERROR: %d\n", ret); return ret); ret = aclrtSetDevice(deviceId); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret); return ret); ret = aclrtCreateStream(stream); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtCreateStream failed. ERROR: %d\n", ret); return ret); return 0; } template <typename T> int CreateAclTensor(const std::vector<T>& hostData, const std::vector<int64_t>& shape, void** deviceAddr, aclDataType dataType, aclTensor** tensor) { auto size = GetShapeSize(shape) * sizeof(T); // 调用aclrtMalloc申请device侧内存 auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret); return ret); // 调用aclrtMemcpy将host侧数据复制到device侧内存上 ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size, ACL_MEMCPY_HOST_TO_DEVICE); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMemcpy failed. ERROR: %d\n", ret); return ret); // 计算连续tensor的strides std::vector<int64_t> strides(shape.size(), 1); for (int64_t i = shape.size() - 2; i >= 0; i--) { strides[i] = shape[i + 1] * strides[i + 1]; } // 调用aclCreateTensor接口创建aclTensor *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND, shape.data(), shape.size(), *deviceAddr); return 0; } int main() { // 1.
（固定写法）device/stream初始化，参考acl API手册 // 根据自己的实际device填写deviceId int32_t deviceId = 0; aclrtStream stream; auto ret = Init(deviceId, &stream); CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("Init acl failed. ERROR: %d\n", ret); return ret); ```