### Install SCALE on Ubuntu

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/how-to-install.md

Installs SCALE on Ubuntu by setting up the SCALE repository and then installing the SCALE package. It also adds the user to the 'video' group. Requires `wget` and `apt`.

```bash
export CUSTOMER_NAME="<customer-username>"
export CUSTOMER_PASSWORD="<customer-password>"

sudo tee /etc/apt/auth.conf.d/scale.conf <<EOF
machine {{repo_subdomain}}.scale-lang.com
login $CUSTOMER_NAME
password $CUSTOMER_PASSWORD
EOF
chmod 700 /etc/apt/auth.conf.d/scale.conf
# Add the scale deb repos.
wget --http-user="$CUSTOMER_NAME" --http-password="$CUSTOMER_PASSWORD" https://{{repo_subdomain}}.scale-lang.com/$CUSTOMER_NAME/deb/dists/{{deb_os.codename}}/main/binary-all/scale-repos.deb
sudo apt install ./scale-repos.deb

# Update package list
sudo apt update

sudo apt install {{ scale_pkgname }}

# Add your user to the `video` group:
sudo usermod -a -G video $(whoami)
```

```bash
# Add the scale deb repos.
wget https://{{repo_subdomain}}.scale-lang.com/deb/dists/{{deb_os.codename}}/main/binary-all/scale-repos.deb
sudo apt install ./scale-repos.deb

# Update package list
sudo apt update

sudo apt install {{ scale_pkgname }}

# Add your user to the `video` group:
sudo usermod -a -G video $(whoami)
```

--------------------------------

### Build and Run SCALE Example using example.sh

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/examples/README.md

This bash script automates the process of building and running a SCALE example. It requires the SCALE target directory and the example name as arguments. The script handles directory cleanup, CMake configuration, Make building, and execution with specific environment variables for error reporting.

```bash
# You should be in the `examples` directory of the `scale-docs` repository
./example.sh /opt/scale gfx1030 basic
```

--------------------------------

### Clone SCALE Examples Repository

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/examples/README.md

This command clones the public GitHub repository containing the SCALE example programs and navigates into the examples directory. It's the first step to accessing the provided code.

```sh
git clone https://github.com/spectral-compute/scale-docs.git
cd scale-docs/examples
```

--------------------------------

### Install SCALE on Ubuntu

Source: https://context7.com/spectral-compute/scale-docs/llms.txt

Installs SCALE on Ubuntu 24.04 by adding the repository, updating the package list, and installing the SCALE package. It also adds the user to the video group and suggests a reboot if necessary.

```bash
# Add the SCALE deb repository
wget https://repo.scale-lang.com/deb/dists/noble/main/binary-all/scale-repos.deb
sudo apt install ./scale-repos.deb

# Update package list and install SCALE
sudo apt update
sudo apt install scale

# Add your user to the video group
sudo usermod -a -G video $(whoami)

# Reboot if amdgpu-dkms was newly installed

```

--------------------------------

### CMake Configuration for cuBLAS Example

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/examples/blas.md

This CMakeLists.txt file configures the build process for the cuBLAS example. It specifies the minimum required CMake version, enables CUDA, finds the cuBLAS library, and links it to the executable. Ensure that the CUDA toolkit is installed and discoverable by CMake.

```cmake
cmake_minimum_required(VERSION 3.10)

project(cublas_example)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 11)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)

find_package(CUDA REQUIRED)

add_executable(cublas_example src/blas/blas.cu)

target_link_libraries(cublas_example PRIVATE cudart cublas)

```

--------------------------------

### Install SCALE on Rocky Linux

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/how-to-install.md

Installs SCALE on Rocky Linux by setting up the SCALE RPM repository and then installing the SCALE package. Requires `wget` and `dnf`.

```bash
export CUSTOMER_NAME="<customer-username>"
export CUSTOMER_PASSWORD="<customer-password>"

# Add the scale rpm repos.
wget --http-user="$CUSTOMER_NAME" --http-password="$CUSTOMER_PASSWORD" https://{{repo_subdomain}}.scale-lang.com/$CUSTOMER_NAME/rpm/el9/main/scale-repos.rpm
sudo dnf install ./scale-repos.rpm

# Tell dnf to authenticate to the repo
sudo tee -a /etc/yum.repos.d/scale.repo <<EOF
username = $CUSTOMER_NAME
password = $CUSTOMER_PASSWORD
EOF
chmod 700 /etc/yum.repos.d/scale.repo

# Install SCALE
sudo dnf install {{ scale_pkgname }}
```

```bash
# Add the scale rpm repos.
sudo dnf install https://{{repo_subdomain}}.scale-lang.com/rpm/el9/main/scale-repos.rpm

# Install SCALE
sudo dnf install {{ scale_pkgname }}
```

--------------------------------

### Basic CUDA Kernel Execution Example

Source: https://context7.com/spectral-compute/scale-docs/llms.txt

A C++ example demonstrating basic CUDA kernel execution, including memory allocation on the device, data transfer between host and device, kernel launch using `<<<>>>` syntax, and result verification. It utilizes a helper function for CUDA error checking.

```cpp
#include <vector>
#include <iostream>

// Define a CUDA kernel for element-wise addition
__global__ void basicSum(const int * a, const int * b, size_t n, int * out) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if(idx < n) {
        out[idx] = a[idx] + b[idx];
    }
}

// Error checking helper
void check(cudaError_t error, const char * file, size_t line) {
    if (error != cudaSuccess) {
        std::cout << "cuda error: " << cudaGetErrorString(error)
                  << " at " << file << ":" << line << std::endl;
        exit(1);
    }
}
#define CHECK(error) check(error, __FILE__, __LINE__)

int main() {
    const size_t N = 4096;
    const size_t BYTES = N * sizeof(int);

    std::vector<int> a(N), b(N), out(N);
    for (size_t i = 0; i < N; i++) {
        a[i] = i * 2;
        b[i] = N - i;
    }

    int *devA, *devB, *devOut;

    // Allocate device memory
    CHECK(cudaMalloc(&devA, BYTES));
    CHECK(cudaMalloc(&devB, BYTES));
    CHECK(cudaMalloc(&devOut, BYTES));

    // Copy input data to device
    CHECK(cudaMemcpy(devA, a.data(), BYTES, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(devB, b.data(), BYTES, cudaMemcpyHostToDevice));

    // Launch kernel with 256 threads per block
    basicSum<<<N / 256 + 1, 256>>>(devA, devB, N, devOut);
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaGetLastError());

    // Copy results back to host
    CHECK(cudaMemcpy(out.data(), devOut, BYTES, cudaMemcpyDeviceToHost));

    // Free device memory
    CHECK(cudaFree(devA));
    CHECK(cudaFree(devB));
    CHECK(cudaFree(devOut));

    // Verify results
    for (size_t i = 0; i < N; i++) {
        if (a[i] + b[i] != out[i]) {
            std::cout << "Error at index " << i << std::endl;
        }
    }
    return 0;
}

```

--------------------------------

### CMake Build Configuration (CMake)

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/examples/ptx.md

This CMakeLists.txt file configures the build process for the PTX example. It specifies the project name, C++ standard, and includes the necessary SCALE CUDA library, enabling the compilation of the PTX kernel.

```cmake
cmake_minimum_required(VERSION 3.10)

project(ptx CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

find_package(SCALE REQUIRED)

add_executable(ptx ptx.cu)

target_link_libraries(ptx PRIVATE SCALE::cuda)

```

--------------------------------

### CMake Build Configuration (CMake)

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/examples/basic.md

This CMakeLists.txt file configures the build process for the CUDA example. It specifies the project name, required CMake version, and enables CUDA support. It then defines the executable and links it against the CUDA runtime library.

```cmake
cmake_minimum_required(VERSION 3.10)

project("basic_cuda_example")

find_package(CUDA REQUIRED)

add_executable(basic_cuda_example src/basic/basic.cu)

target_link_libraries(basic_cuda_example PRIVATE CUDA::cudart)

```

--------------------------------

### Remove SCALE Repositories (Ubuntu)

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/how-to-install.md

Removes all SCALE-related repositories and configuration files on Ubuntu systems. This is a prerequisite for re-adding repositories if they become corrupted or outdated.

```bash
sudo apt-get remove 'scale-repos*'
sudo rm -f /etc/apt/sources.list.d/scale.list /etc/apt/auth.conf.d/scale.conf
```

--------------------------------

### Query CUDA Module Information

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-driver.md

Functions to retrieve information about loaded CUDA modules, such as getting a handle to a function or the memory address and size of a global variable. Also includes functions to get the module loading mode.

```c
CUresult cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name);
CUresult cuModuleGetGlobal(CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, const char* name);
CUresult cuModuleGetLoadingMode(CUmoduleLoadingMode* mode);
```

--------------------------------

### Download and Extract SCALE Tarball

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/how-to-install.md

Downloads and extracts the SCALE tarball for systems not covered by package managers. This method includes many dependent libraries directly. Requires `wget` and `tar`.

```bash
export CUSTOMER_NAME="<customer-username>"
export CUSTOMER_PASSWORD="<customer-password>"

wget --http-user="$CUSTOMER_NAME" --http-password="$CUSTOMER_PASSWORD" https://{{repo_subdomain}}.scale-lang.com/$CUSTOMER_NAME/tar/{{scale_pkgname}}-latest-amd64.tar.xz
# Extract it to the current directory
tar xf {{scale_pkgname}}-latest-amd64.tar.xz
```

```bash
# Download the tarball
wget https://{{ repo_subdomain }}.scale-lang.com/tar/{{scale_pkgname}}-latest-amd64.tar.xz
# Extract it to the current directory
tar xf {{scale_pkgname}}-latest-amd64.tar.xz
```

--------------------------------

### Remove SCALE Repositories (Rocky Linux)

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/how-to-install.md

Removes all SCALE-related repositories and configuration files on Rocky Linux systems. This is a prerequisite for re-adding repositories if they become corrupted or outdated.

```bash
sudo dnf remove 'scale-repos*'
sudo rm -f /etc/yum.repos.d/scale.repo
```

--------------------------------

### Profiler Control API

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-driver.md

Functions for controlling the CUDA profiler, including initialization, starting, and stopping.

```APIDOC
## Profiler Control API

### Description
Functions for controlling the CUDA profiler, including initialization, starting, and stopping.

### Methods

#### `cuProfilerInitialize`
Initializes the CUDA profiler with configuration and output files.

- **Method**: `CUresult cuProfilerInitialize(const char* configFile, const char* outputFile, CUoutput_mode outputMode)`

#### `cuProfilerStart`
Starts the CUDA profiler.

- **Method**: `CUresult cuProfilerStart(void)`

#### `cuProfilerStop`
Stops the CUDA profiler.

- **Method**: `CUresult cuProfilerStop(void)`
```

--------------------------------

### Host Node Parameter Management

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-driver.md

Functions to get and set parameters for host nodes in a CUDA graph.

```APIDOC
## Host Node Parameter Management

### Description
Functions to retrieve and set the parameters for host nodes within a CUDA graph.

### Method
`CUresult`

### Endpoints
- `cuGraphHostNodeGetParams`
- `cuGraphHostNodeSetParams`

### Parameters
#### Path Parameters
None

#### Query Parameters
None

#### Request Body
None (parameters are passed as function arguments)

### Request Example
None

### Response
#### Success Response (CU_SUCCESS)
- `CUresult` (CU_SUCCESS indicates success)

#### Response Example
`CU_SUCCESS`
```

--------------------------------

### Memory Node Parameter Management

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-driver.md

Functions to get and set parameters for memory allocation, free, copy, and set nodes.

```APIDOC
## Memory Node Parameter Management

### Description
Functions for retrieving and setting parameters for nodes related to memory operations like allocation, free, copy, and memset.

### Method
`CUresult`

### Endpoints
- `cuGraphMemAllocNodeGetParams`
- `cuGraphMemFreeNodeGetParams`
- `cuGraphMemcpyNodeGetParams`
- `cuGraphMemcpyNodeSetParams`
- `cuGraphMemsetNodeGetParams`
- `cuGraphMemsetNodeSetParams`

### Parameters
#### Path Parameters
None

#### Query Parameters
None

#### Request Body
None (parameters are passed as function arguments)

### Request Example
None

### Response
#### Success Response (CU_SUCCESS)
- `CUresult` (CU_SUCCESS indicates success)

#### Response Example
`CU_SUCCESS`
```

--------------------------------

### Get CUDA Driver Version

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-driver.md

Retrieves the version of the installed CUDA driver. This function is useful for checking compatibility and ensuring the correct CUDA toolkit is being used. It takes a pointer to an integer where the driver version will be stored.

```c
CUresult cuDriverGetVersion(int* driverVersion);
```

--------------------------------

### Get CUDA Driver and Runtime Versions

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-runtime.md

Retrieves the installed CUDA driver version and the CUDA runtime version. These functions are essential for compatibility checks and understanding the CUDA environment. They take a pointer to an integer to store the version number.

```c++
__host__ cudaError_t cudaDriverGetVersion(int* driverVersion);
__host__  cudaError_t cudaRuntimeGetVersion(int* runtimeVersion);
__device__ cudaError_t cudaRuntimeGetVersion(int* runtimeVersion);
```

--------------------------------

### Perform Double-Precision Dot-Product using cuBLAS

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/examples/blas.md

This C++ CUDA code performs a double-precision dot-product operation. It utilizes cuBLAS library functions, which are compatible with SCALE and can be forwarded to ROCm APIs for execution on AMD GPUs. Ensure cuBLAS is correctly installed and linked.

```cpp
#include <iostream>
#include <vector>

#include <cublas_v2.h>

// Error checking for CUDA and cuBLAS calls
#define CUDA_CHECK(call)                                                              \
    do {                                                                            \
        cudaError_t err = call;                                                     \
        if (err != cudaSuccess) {                                                   \
            fprintf(stderr, "CUDA error at %s:%d - %s\n", __FILE__, __LINE__,       \
                    cudaGetErrorString(err));                                           \
            exit(EXIT_FAILURE);                                                     \
        }                                                                           \
    } while (0)

#define CUBLAS_CHECK(call)                                                          \
    do {                                                                            \
        cublasStatus_t status = call;                                               \
        if (status != CUBLAS_STATUS_SUCCESS) {                                      \
            fprintf(stderr, "CUBLAS error at %s:%d - %d\n", __FILE__, __LINE__,     \
                    status);                                                            \
            exit(EXIT_FAILURE);                                                     \
        }                                                                           \
    } while (0)

int main() {
    int n = 1024;
    std::vector<double> h_a(n, 1.0);
    std::vector<double> h_b(n, 2.0);
    std::vector<double> h_c(1, 0.0);

    // Allocate device memory
    double *d_a, *d_b, *d_c;
    CUDA_CHECK(cudaMalloc(&d_a, n * sizeof(double)));
    CUDA_CHECK(cudaMalloc(&d_b, n * sizeof(double)));
    CUDA_CHECK(cudaMalloc(&d_c, sizeof(double)));

    // Copy data from host to device
    CUDA_CHECK(cudaMemcpy(d_a, h_a.data(), n * sizeof(double), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b.data(), n * sizeof(double), cudaMemcpyHostToDevice));

    // Initialize cuBLAS
    cublasHandle_t handle;
    CUBLAS_CHECK(cublasCreate(&handle));

    // Perform dot product
    CUBLAS_CHECK(cublasDdot(handle, n, d_a, 1, d_b, 1, d_c));

    // Copy result from device to host
    CUDA_CHECK(cudaMemcpy(h_c.data(), d_c, sizeof(double), cudaMemcpyDeviceToHost));

    // Print result
    std::cout << "Dot product result: " << h_c[0] << std::endl;

    // Clean up
    CUBLAS_CHECK(cublasDestroy(handle));
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    return 0;
}

```

--------------------------------

### Diagnosing SCALE Initialization Errors and No Devices Found

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/troubleshooting.md

This section provides example error messages and diagnostic commands for troubleshooting SCALE initialization failures. It covers issues like outdated AMD kernel modules, incorrect `/dev/kfd` permissions, and the absence of supported GPUs. Commands like `scaleinfo` and `hsakmtsysinfo` are shown, along with the expected output for specific errors.

```bash
$ SCALE_EXCEPTIONS=1 ./myProgram
terminate called after throwing an instance of 'redscale::SimpleException'
  what():  cudaDeviceSynchronize: No usable CUDA devices found., CUDA error: "no device"
Aborted (core dumped)
```

```bash
$ /opt/scale/bin/scaleinfo
Error getting device count: initialization error
```

```bash
$ /opt/scale/bin/hsakmtsysinfo
terminate called after throwing an instance of 'std::runtime_error'
  what():  HSAKMT Error 20: Could not open KFD
Aborted (core dumped)
```

```bash
# Verify GPU architecture
/opt/scale/bin/hsasysinfo | grep 'Name: gfx'
```

```bash
# Ensure /dev/kfd is writable (Ubuntu example)
sudo usermod -a -G render USERNAME

# Temporarily make /dev/kfd world-writable
sudo chmod 666 /dev/kfd
```

--------------------------------

### Configure CUDA Executable with CMake

Source: https://github.com/spectral-compute/scale-docs/blob/master/examples/src/basic/CMakeLists.txt

This snippet shows how to configure a basic CUDA executable using CMake. It specifies the minimum CMake version, project name, supported languages (CUDA), and adds the source file 'basic.cu' to create the executable 'example_basic'.

```cmake
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
project(example_basic LANGUAGES CUDA)

add_executable(example_basic basic.cu)
```

--------------------------------

### CUDA Graph Creation and Execution in SCALE (C++)

Source: https://context7.com/spectral-compute/scale-docs/llms.txt

Illustrates how to create, instantiate, launch, and update CUDA Graphs using SCALE. This example includes adding kernel and memory copy nodes to a graph and executing it on a stream.

```cpp
__global__ void addKernel(float* a, float* b, float* c, int n) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n) c[idx] = a[idx] + b[idx];
}

int main() {
    const int N = 1024;
    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, N * sizeof(float));
    cudaMalloc(&d_b, N * sizeof(float));
    cudaMalloc(&d_c, N * sizeof(float));

    // Create graph
    cudaGraph_t graph;
    cudaGraphCreate(&graph, 0);

    // Add kernel node
    cudaGraphNode_t kernelNode;
    cudaKernelNodeParams kernelParams = {};
    void* args[] = {&d_a, &d_b, &d_c, (void*)&N};
    kernelParams.func = (void*)addKernel;
    kernelParams.gridDim = dim3((N + 255) / 256);
    kernelParams.blockDim = dim3(256);
    kernelParams.kernelParams = args;
    cudaGraphAddKernelNode(&kernelNode, graph, nullptr, 0, &kernelParams);

    // Add memcpy nodes
    cudaGraphNode_t memcpyNode;
    float h_data[N];
    cudaGraphAddMemcpyNode1D(&memcpyNode, graph, &kernelNode, 1,
                             h_data, d_c, N * sizeof(float),
                             cudaMemcpyDeviceToHost);

    // Instantiate graph
    cudaGraphExec_t graphExec;
    cudaGraphInstantiate(&graphExec, graph, 0);

    // Launch graph
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaGraphLaunch(graphExec, stream);
    cudaStreamSynchronize(stream);

    // Update and re-execute
    cudaGraphExecUpdate(graphExec, graph, nullptr);
    cudaGraphLaunch(graphExec, stream);

    // Cleanup
    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
    cudaStreamDestroy(stream);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}

```

--------------------------------

### CUDA Graph Node Get Enabled Status

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-runtime.md

Gets the enabled status of a node in an executable CUDA graph.

```APIDOC
## cudaGraphNodeGetEnabled

### Description
Gets the enabled status of a node in an executable CUDA graph.

### Method
`__host__`

### Endpoint
`cudaGraphNodeGetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, unsigned int* isEnabled)`

### Parameters
#### Path Parameters
None

#### Query Parameters
None

#### Request Body
- **hGraphExec** (`cudaGraphExec_t`) - The handle to the executable graph.
- **hNode** (`cudaGraphNode_t`) - The handle of the node.
- **isEnabled** (`unsigned int*`) - Pointer to store the enabled status (0 for disabled, 1 for enabled).

### Request Example
```json
{
  "hGraphExec": "graph_exec_handle",
  "hNode": "node_handle",
  "isEnabled": "output_status_pointer"
}
```

### Response
#### Success Response (0)
- **cudaError_t** - Returns `cudaSuccess` on success.

#### Response Example
```json
{
  "status": "cudaSuccess"
}
```
```

--------------------------------

### Example API Diff Syntax Highlighting

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/apis.md

Demonstrates the use of `diff` syntax highlighting to represent API coverage. Lines prefixed with '-' indicate missing APIs, while other lines show available APIs. This format helps visualize the differences between SCALE and NVIDIA's API documentation.

```diff
const char * cudaGetErrorName(cudaError_t);
__device__ const char * cudaGetErrorName(cudaError_t);
const char * cudaGetErrorString(cudaError_t);
__device__ const char * cudaGetErrorString(cudaError_t);
cudaError_t cudaGetLastError();
-__device__ cudaError_t cudaGetLastError();
cudaError_t cudaPeekAtLastError();
-__device__ cudaError_t cudaPeekAtLastError();
```

--------------------------------

### Resolve 'cstddef' file not found error

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/troubleshooting.md

This error indicates that the C++ standard library include files, specifically 'cstddef', cannot be found. This can occur on distributions like Ubuntu if multiple GCC versions are installed but the corresponding g++ version is missing. Ensure that the g++ version matching your gcc installation is installed.

```c++
In file included from <built-in>:1:

In file included from /opt/scale/targets/gfx1100/include/redscale_impl/device.h:6:

In file included from /opt/scale/targets/gfx1100/include/redscale_impl/common.h:40:

/opt/scale/targets/gfx1100/include/redscale_impl/../cuda.h:15:10: fatal error: 'cstddef' file not found

#include <cstddef>

         ^

1 error generated when compiling for gfx1100.
```

--------------------------------

### CUDA Stream Flags, ID, and Priority

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-driver.md

Functions to get the flags, unique ID, and priority of a CUDA stream. `cuStreamGetFlags` retrieves the stream's creation flags. `cuStreamGetId` returns the stream's unique identifier. `cuStreamGetPriority` gets the stream's priority.

```c
CUresult cuStreamGetFlags(CUstream hStream, unsigned int* flags);
CUresult cuStreamGetId(CUstream hStream, unsigned long long* streamId);
CUresult cuStreamGetPriority(CUstream hStream, int* priority);
```

--------------------------------

### Build CUDA Executable with PTX Compilation

Source: https://github.com/spectral-compute/scale-docs/blob/master/examples/src/ptx/CMakeLists.txt

This snippet demonstrates how to configure a CMake project to build an executable from a CUDA source file. It specifies the minimum required CMake version, the project name, and the language as CUDA. The `add_executable` command links the source file `ptx.cu` to the executable `example_ptx`.

```cmake
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
project(example_ptx LANGUAGES CUDA)

add_executable(example_ptx ptx.cu)
```

--------------------------------

### Activate SCALE Environment and Build CUDA Project

Source: https://context7.com/spectral-compute/scale-docs/llms.txt

Activates a SCALE environment for transparent CUDA-to-AMD compilation using the `scaleenv` script. It then demonstrates building a CUDA project using standard CMake and Make commands after activating the environment for a specific GPU target.

```bash
# Identify your GPU target
scaleinfo | grep gfx
# Output: Device 0 (00:23:00.0): AMD Radeon Pro W6800 - gfx1030 (AMD) <amdgcn-amd-amdhsa--gfx1030>

# Activate SCALE environment for gfx1030
source /opt/scale/bin/scaleenv gfx1030

# Build your CUDA project using standard commands
mkdir build && cd build
cmake .. -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON
make

# Deactivate when done
deactivate

```

--------------------------------

### Asynchronous CUDA Operations with Streams and Events in C++

Source: https://context7.com/spectral-compute/scale-docs/llms.txt

This C++ example illustrates the use of CUDA streams for asynchronous operations and events for timing and synchronization. It demonstrates creating streams with and without priority, allocating pinned host memory for faster transfers, performing asynchronous memory copies and kernel launches, recording events to measure execution time, querying stream status, and cleaning up resources.

```cpp
#include <iostream>

__global__ void kernel(float* data, int n) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n) data[idx] *= 2.0f;
}

int main() {
    const int N = 1024 * 1024;
    float *h_data, *d_data;

    // Allocate pinned host memory for async transfers
    cudaHostAlloc(&h_data, N * sizeof(float), cudaHostAllocDefault);
    cudaMalloc(&d_data, N * sizeof(float));

    // Create streams
    cudaStream_t stream1, stream2;
    cudaStreamCreate(&stream1);
    cudaStreamCreateWithPriority(&stream2, cudaStreamNonBlocking, -1);

    // Create events for timing
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Record start event
    cudaEventRecord(start, stream1);

    // Async memory copy and kernel launch
    cudaMemcpyAsync(d_data, h_data, N * sizeof(float),
                    cudaMemcpyHostToDevice, stream1);
    kernel<<<N/256, 256, 0, stream1>>>(d_data, N);
    cudaMemcpyAsync(h_data, d_data, N * sizeof(float),
                    cudaMemcpyDeviceToHost, stream1);

    // Record stop event
    cudaEventRecord(stop, stream1);

    // Wait for completion
    cudaEventSynchronize(stop);

    // Calculate elapsed time
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "Time: " << milliseconds << " ms" << std::endl;

    // Query stream status
    cudaError_t status = cudaStreamQuery(stream1);
    if (status == cudaSuccess) {
        std::cout << "Stream completed" << std::endl;
    }

    // Cleanup
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    cudaHostFree(h_data);
    cudaFree(d_data);

    return 0;
}

```

--------------------------------

### Array Information and Manipulation (CUDA C++)

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-runtime.md

Functions for retrieving information about CUDA arrays and manipulating them. `cudaArrayGetInfo` retrieves the channel description and extent of a CUDA array. `cudaArrayGetPlane` gets a specific plane from a CUDA array. `cudaArrayGetSparseProperties` retrieves sparse properties. `cudaGetMipmappedArrayLevel` gets a specific level of a mipmapped array.

```cuda
__host__ cudaError_t cudaArrayGetInfo(cudaChannelFormatDesc* desc, cudaExtent* extent, unsigned int* flags, cudaArray_t array);
__host__ cudaError_t cudaArrayGetMemoryRequirements(cudaArrayMemoryRequirements* memoryRequirements, cudaArray_t array, int device);
__host__ cudaError_t cudaArrayGetPlane(cudaArray_t* pPlaneArray, cudaArray_t hArray, unsigned int planeIdx);
__host__ cudaError_t cudaArrayGetSparseProperties(cudaArraySparseProperties* sparseProperties, cudaArray_t array);
__host__ cudaError_t cudaGetMipmappedArrayLevel(cudaArray_t* levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level);
```

--------------------------------

### Graph Instantiation

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-driver.md

Instantiates a graph execution context from a graph definition.

```APIDOC
## Graph Instantiation

### Description
Creates a `CUgraphExec` handle from a `CUgraph` object, preparing it for execution.

### Method
`CUresult`

### Endpoints
- `cuGraphInstantiate`
- `cuGraphInstantiateWithParams`

### Parameters
#### Path Parameters
None

#### Query Parameters
None

#### Request Body
None (parameters are passed as function arguments)

### Request Example
None

### Response
#### Success Response (CU_SUCCESS)
- `CUresult` (CU_SUCCESS indicates success)
- `CUgraphExec*` (Pointer to the created graph execution handle)

#### Response Example
`CU_SUCCESS`
```

--------------------------------

### Node Type and Parameter Setting

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-driver.md

Functions to get the type of a node and to set generic node parameters.

```APIDOC
## Node Type and Parameter Setting

### Description
Functions to determine the type of a graph node and to set generic parameters for a node.

### Method
`CUresult`

### Endpoints
- `cuGraphNodeGetType`
- `cuGraphNodeSetParams`

### Parameters
#### Path Parameters
None

#### Query Parameters
None

#### Request Body
None (parameters are passed as function arguments)

### Request Example
None

### Response
#### Success Response (CU_SUCCESS)
- `CUresult` (CU_SUCCESS indicates success)

#### Response Example
`CU_SUCCESS`
```

--------------------------------

### Configure CMake Project for CUDA BLAS

Source: https://github.com/spectral-compute/scale-docs/blob/master/examples/src/blas/CMakeLists.txt

This snippet sets up the CMake build environment for a project named 'example_blas' that uses CUDA. It specifies the minimum required CMake version and the languages to be supported. The project then defines an executable 'example_blas' built from 'blas.cu' and links it against the 'cublas' and 'redscale' libraries.

```cmake
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
project(example_blas LANGUAGES CUDA)

add_executable(example_blas blas.cu)
target_link_libraries(example_blas PRIVATE cublas redscale)
```

--------------------------------

### Launch Host Function (CUDA C)

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-driver.md

Launches a host function to be executed on the CUDA device. This allows for more flexible execution control and interop between host and device code.

```cuda-c
CUresult cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void* userData);
```

--------------------------------

### Launch Cooperative Kernel (CUDA C)

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-driver.md

Launches a cooperative kernel on the CUDA device. Cooperative kernels allow multiple blocks to synchronize and cooperate during execution.

```cuda-c
CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void** kernelParams);
```

--------------------------------

### External Semaphore Node Parameter Management

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-driver.md

Functions to get and set parameters for external semaphore signal and wait nodes.

```APIDOC
## External Semaphore Node Parameter Management

### Description
Functions for retrieving and setting parameters for nodes that signal or wait on external semaphores.

### Method
`CUresult`

### Endpoints
- `cuGraphExternalSemaphoresSignalNodeGetParams`
- `cuGraphExternalSemaphoresSignalNodeSetParams`
- `cuGraphExternalSemaphoresWaitNodeGetParams`
- `cuGraphExternalSemaphoresWaitNodeSetParams`

### Parameters
#### Path Parameters
None

#### Query Parameters
None

#### Request Body
None (parameters are passed as function arguments)

### Request Example
None

### Response
#### Success Response (CU_SUCCESS)
- `CUresult` (CU_SUCCESS indicates success)

#### Response Example
`CU_SUCCESS`
```

--------------------------------

### Memory Management in SCALE (C++)

Source: https://context7.com/spectral-compute/scale-docs/llms.txt

Demonstrates various memory allocation and transfer patterns using SCALE's memory management API, including device memory, unified memory, pinned host memory, and asynchronous operations. It also shows how to query memory information and pointer attributes.

```cpp
#include <iostream>

int main() {
    const size_t SIZE = 1024 * sizeof(float);

    // Standard device memory allocation
    float *d_ptr;
    cudaMalloc(&d_ptr, SIZE);
    cudaMemset(d_ptr, 0, SIZE);

    // Pitched memory for 2D arrays
    float *d_pitched;
    size_t pitch;
    cudaMallocPitch(&d_pitched, &pitch, 1024 * sizeof(float), 1024);

    // Managed (unified) memory
    float *managed;
    cudaMallocManaged(&managed, SIZE, cudaMemAttachGlobal);

    // Pinned host memory
    float *h_pinned;
    cudaHostAlloc(&h_pinned, SIZE, cudaHostAllocMapped);

    // Get device pointer from pinned host memory
    float *d_mapped;
    cudaHostGetDevicePointer(&d_mapped, h_pinned, 0);

    // Async memory operations with streams
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaMallocAsync(&d_ptr, SIZE, stream);
    cudaFreeAsync(d_ptr, stream);

    // Memory info
    size_t free_mem, total_mem;
    cudaMemGetInfo(&free_mem, &total_mem);
    std::cout << "Free: " << free_mem / (1024*1024) << " MB, "
              << "Total: " << total_mem / (1024*1024) << " MB" << std::endl;

    // Pointer attributes query
    cudaPointerAttributes attrs;
    cudaPointerGetAttributes(&attrs, managed);
    std::cout << "Memory type: " << attrs.type << std::endl;

    // 2D memory copy
    cudaMemcpy2D(d_pitched, pitch, h_pinned, 1024 * sizeof(float),
                 1024 * sizeof(float), 1024, cudaMemcpyHostToDevice);

    // Peer-to-peer access
    int canAccessPeer;
    cudaDeviceCanAccessPeer(&canAccessPeer, 0, 1);
    if (canAccessPeer) {
        cudaDeviceEnablePeerAccess(1, 0);
    }

    // Cleanup
    cudaFree(d_pitched);
    cudaFree(managed);
    cudaFreeHost(h_pinned);
    cudaStreamDestroy(stream);

    return 0;
}

```

--------------------------------

### Fix nvcc: cannot find libdevice or CUDA installation

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/troubleshooting.md

This error arises when nvcc cannot locate the necessary 'libdevice' files for a specific GPU architecture (e.g., sm_52) or cannot find the CUDA installation directory. This often happens when a generic target like 'gfxany' is used instead of a specific one. Specify a target-specific directory or use the '-arch' flag with a specific target.

```bash
nvcc: error: cannot find libdevice for sm_52; provide path to different CUDA installation via '--cuda-path', or pass '-nocudalib' to build without linking with libdevice
nvcc: error: cannot find CUDA installation; provide its path via '--cuda-path', or pass '-nocudainc' to build without CUDA includes
```

--------------------------------

### CUDA Graph Node Get Type

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-runtime.md

Retrieves the type of a CUDA graph node.

```APIDOC
## cudaGraphNodeGetType

### Description
Retrieves the type of a CUDA graph node.

### Method
`__host__`

### Endpoint
`cudaGraphNodeGetType(cudaGraphNode_t node, cudaGraphNodeType ** pType)`

### Parameters
#### Path Parameters
None

#### Query Parameters
None

#### Request Body
- **node** (`cudaGraphNode_t`) - The handle of the node.
- **pType** (`cudaGraphNodeType **`) - Pointer to store the node type.

### Request Example
```json
{
  "node": "node_handle",
  "pType": "output_type_pointer"
}
```

### Response
#### Success Response (0)
- **cudaError_t** - Returns `cudaSuccess` on success.

#### Response Example
```json
{
  "status": "cudaSuccess"
}
```
```

--------------------------------

### Graph Launch

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-driver.md

Launches a CUDA graph execution on a specified stream.

```APIDOC
## Graph Launch

### Description
Executes a previously instantiated CUDA graph on a given CUDA stream.

### Method
`CUresult`

### Endpoint
`cuGraphLaunch`

### Parameters
#### Path Parameters
None

#### Query Parameters
None

#### Request Body
None (parameters are passed as function arguments)

### Request Example
None

### Response
#### Success Response (CU_SUCCESS)
- `CUresult` (CU_SUCCESS indicates success)

#### Response Example
`CU_SUCCESS`
```

--------------------------------

### CUDA Graph Node Get Dependencies

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-runtime.md

Retrieves the dependencies of a node in a CUDA graph.

```APIDOC
## cudaGraphNodeGetDependencies

### Description
Retrieves the dependencies of a node in a CUDA graph.

### Method
`__host__`

### Endpoint
`cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, cudaGraphEdgeData* edgeData, size_t* pNumDependencies)`

### Parameters
#### Path Parameters
None

#### Query Parameters
None

#### Request Body
- **node** (`cudaGraphNode_t`) - The handle of the node.
- **pDependencies** (`cudaGraphNode_t*`) - Pointer to an array to store the handles of dependent nodes.
- **edgeData** (`cudaGraphEdgeData*`) - Pointer to an array to store edge data.
- **pNumDependencies** (`size_t*`) - Pointer to store the number of dependencies.

### Request Example
```json
{
  "node": "node_handle",
  "pDependencies": "output_dependencies_pointer",
  "edgeData": "output_edge_data_pointer",
  "pNumDependencies": "output_num_dependencies_pointer"
}
```

### Response
#### Success Response (0)
- **cudaError_t** - Returns `cudaSuccess` on success.

#### Response Example
```json
{
  "status": "cudaSuccess"
}
```
```

--------------------------------

### CUDA Device Enumeration and Configuration in C++

Source: https://context7.com/spectral-compute/scale-docs/llms.txt

This C++ code snippet demonstrates how to use the CUDA Device Management API to enumerate available GPU devices, query their properties (name, compute capability, memory, multiprocessors, max threads per block), set the active device, and retrieve specific device attributes. It also shows how to get the stream priority range and synchronize/reset the device.

```cpp
#include <iostream>

int main() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    for (int i = 0; i < deviceCount; i++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);

        std::cout << "Device " << i << ": " << prop.name << std::endl;
        std::cout << "  Compute capability: " << prop.major << "."
                  << prop.minor << std::endl;
        std::cout << "  Total memory: " << prop.totalGlobalMem / (1024*1024)
                  << " MB" << std::endl;
        std::cout << "  Multiprocessors: " << prop.multiProcessorCount << std::endl;
        std::cout << "  Max threads per block: " << prop.maxThreadsPerBlock << std::endl;
    }

    // Set active device
    cudaSetDevice(0);

    // Query device attributes
    int maxThreads;
    cudaDeviceGetAttribute(&maxThreads, cudaDevAttrMaxThreadsPerBlock, 0);

    // Get stream priority range
    int leastPriority, greatestPriority;
    cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);

    // Synchronize device
    cudaDeviceSynchronize();

    // Reset device state
    cudaDeviceReset();

    return 0;
}

```

--------------------------------

### CUDA Graph Node Get Dependent Nodes

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-runtime.md

Retrieves the nodes that depend on a given node in a CUDA graph.

```APIDOC
## cudaGraphNodeGetDependentNodes

### Description
Retrieves the nodes that depend on a given node in a CUDA graph.

### Method
`__host__`

### Endpoint
`cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, cudaGraphEdgeData* edgeData, size_t* pNumDependentNodes)`

### Parameters
#### Path Parameters
None

#### Query Parameters
None

#### Request Body
- **node** (`cudaGraphNode_t`) - The handle of the node.
- **pDependentNodes** (`cudaGraphNode_t*`) - Pointer to an array to store the handles of dependent nodes.
- **edgeData** (`cudaGraphEdgeData*`) - Pointer to an array to store edge data.
- **pNumDependentNodes** (`size_t*`) - Pointer to store the number of dependent nodes.

### Request Example
```json
{
  "node": "node_handle",
  "pDependentNodes": "output_dependent_nodes_pointer",
  "edgeData": "output_edge_data_pointer",
  "pNumDependentNodes": "output_num_dependent_nodes_pointer"
}
```

### Response
#### Success Response (0)
- **cudaError_t** - Returns `cudaSuccess` on success.

#### Response Example
```json
{
  "status": "cudaSuccess"
}
```
```

--------------------------------

### Graph Upload

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-driver.md

Uploads a graph execution context to the device.

```APIDOC
## Graph Upload

### Description
Uploads the graph execution context to the device, making it ready for launch.

### Method
`CUresult`

### Endpoint
`cuGraphUpload`

### Parameters
#### Path Parameters
None

#### Query Parameters
None

#### Request Body
None (parameters are passed as function arguments)

### Request Example
None

### Response
#### Success Response (CU_SUCCESS)
- `CUresult` (CU_SUCCESS indicates success)

#### Response Example
`CU_SUCCESS`
```

--------------------------------

### Get Peer-to-Peer Attribute (CUDA)

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-driver.md

Retrieves specific peer-to-peer attributes between two CUDA devices. This can include information about supported atomic operations or other P2P capabilities.

```cuda
CUresult cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice);
```

--------------------------------

### SCALE Language Extensions for GPU Programming

Source: https://context7.com/spectral-compute/scale-docs/llms.txt

Showcases SCALE's opt-in language extensions designed to enhance GPU programming, including template-dependent loop unrolling for automatic unrolling and a provable builtin for compile-time optimization hints. It also demonstrates portable warp-size handling and getter attributes for thread indexing.

```cpp
// Template-dependent loop unrolling (alternative to #pragma unroll)
template<int UnrollAmount>
__device__ void processArray(float* data, int n) {
    [[clang::loop_unroll UnrollAmount]]
    for (int i = 0; i < n; i++) {
        data[i] = data[i] * 2.0f;
    }
}

// Optimisation hint for compile-time provable conditions
__device__ int optimizedDivide(int input) {
    if (__builtin_provable(input % 2 == 0)) {
        // Fast path: compiler proved input is even
        return input >> 1;
    } else {
        // Slow path: general case
        return input / 2;
    }
}

// Portable warp-size handling with cudaLaneMask_t
__device__ void portableBallot() {
    // Use auto for ballot return type - works on all platforms
    auto mask = __ballot(1);

    // Or explicitly use cudaLaneMask_t (SCALE extension)
    #ifdef __REDSCALE__
    cudaLaneMask_t explicit_mask = __ballot(1);
    #endif
}

// Getter attribute for implementing thread indexing
// [[clang::getter(someFunction)]] int example;
// All references to 'example' become calls to someFunction()

```

--------------------------------

### Get CUDA Graph Node Type

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-runtime.md

Retrieves the type of a CUDA graph node. This function is intended for host-side operations.

```cuda
__host__ cudaError_t cudaGraphNodeGetType(cudaGraphNode_t node, cudaGraphNodeType ** pType);
```

--------------------------------

### CUDA Green Context Management Functions (C)

Source: https://github.com/spectral-compute/scale-docs/blob/master/docs/manual/api-driver.md

Functions for creating, destroying, and managing CUDA Green Contexts. These contexts allow for more flexible resource management and efficient GPU utilization. They involve device resource descriptions and event recording/waiting.

```c
CUresult cuCtxFromGreenCtx(CUcontext* pContext, CUgreenCtx hCtx);
CUresult cuCtxGetDevResource(CUcontext hCtx, CUdevResource* resource, CUdevResourceType type);
CUresult cuDevResourceGenerateDesc(CUdevResourceDesc* phDesc, CUdevResource* resources, unsigned int nbResources);
CUresult cuDevSmResourceSplitByCount(CUdevResource* result, unsigned int* nbGroups, const CUdevResource* input, CUdevResource* remaining, unsigned int useFlags, unsigned int minCount);
CUresult cuDeviceGetDevResource(CUdevice device, CUdevResource* resource, CUdevResourceType type);
CUresult cuGreenCtxCreate(CUgreenCtx* phCtx, CUdevResourceDesc desc, CUdevice dev, unsigned int flags);
CUresult cuGreenCtxDestroy(CUgreenCtx hCtx);
CUresult cuGreenCtxGetDevResource(CUgreenCtx hCtx, CUdevResource* resource, CUdevResourceType type);
CUresult cuGreenCtxGetId(CUgreenCtx greenCtx, unsigned long long* greenCtxId);
CUresult cuGreenCtxRecordEvent(CUgreenCtx hCtx, CUevent hEvent);
CUresult cuGreenCtxStreamCreate(CUstream* phStream, CUgreenCtx greenCtx, unsigned int flags, int priority);
CUresult cuGreenCtxWaitEvent(CUgreenCtx hCtx, CUevent hEvent);
CUresult cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx* phCtx);
```