mirror of
https://github.com/johndoe6345789/metabuilder.git
synced 2026-04-25 06:14:59 +00:00
76 lines
2.7 KiB
Mojo
Executable File
76 lines
2.7 KiB
Mojo
Executable File
# ===----------------------------------------------------------------------=== #
|
|
# Copyright (c) 2025, Modular Inc. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License v2.0 with LLVM Exceptions:
|
|
# https://llvm.org/LICENSE.txt
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
# ===----------------------------------------------------------------------=== #
|
|
|
|
from math import ceildiv
|
|
from sys import has_accelerator
|
|
|
|
from gpu import global_idx
|
|
from gpu.host import DeviceContext
|
|
from layout import Layout, LayoutTensor
|
|
|
|
# Problem size: number of elements in each vector.
comptime VECTOR_WIDTH = 10
# Threads per block used when launching the kernel.
comptime BLOCK_SIZE = 5

# Element type shared by every buffer and tensor in this example.
comptime float_dtype = DType.float32
# Row-major layout with VECTOR_WIDTH elements, used for all tensors.
comptime layout = Layout.row_major(VECTOR_WIDTH)
|
|
|
|
|
|
def main():
    """Add two constant-filled vectors on the GPU and print the result.

    Creates three device buffers of `VECTOR_WIDTH` elements, fills the two
    inputs with 1.25 and 2.5, enqueues `vector_addition` as a kernel over
    them, then maps the output buffer to the host and prints it.
    """
    # The accelerator check is passed as a parameter, so it is evaluated
    # when the program is compiled rather than when it runs.
    constrained[has_accelerator(), "This example requires a supported GPU"]()

    # Enough blocks of BLOCK_SIZE threads to cover every element.
    var num_blocks = ceildiv(VECTOR_WIDTH, BLOCK_SIZE)

    # Context through which all device work below is enqueued.
    var device = DeviceContext()

    # Storage for both operands and the result.
    var lhs_dev = device.enqueue_create_buffer[float_dtype](VECTOR_WIDTH)
    var rhs_dev = device.enqueue_create_buffer[float_dtype](VECTOR_WIDTH)
    var out_dev = device.enqueue_create_buffer[float_dtype](VECTOR_WIDTH)

    # Each operand holds one constant repeated across its full width.
    lhs_dev.enqueue_fill(1.25)
    rhs_dev.enqueue_fill(2.5)

    # Tensor views over the buffers, in the form the kernel expects.
    var lhs_view = LayoutTensor[float_dtype, layout](lhs_dev)
    var rhs_view = LayoutTensor[float_dtype, layout](rhs_dev)
    var out_view = LayoutTensor[float_dtype, layout](out_dev)

    # Enqueue the kernel; VECTOR_WIDTH is the bound the kernel checks
    # each thread index against.
    device.enqueue_function[vector_addition, vector_addition](
        lhs_view,
        rhs_view,
        out_view,
        VECTOR_WIDTH,
        grid_dim=num_blocks,
        block_dim=BLOCK_SIZE,
    )

    # Map the output to the host so the values can be printed from the CPU.
    with out_dev.map_to_host() as host_data:
        var host_view = LayoutTensor[float_dtype, layout](host_data)
        print("Resulting vector:", host_view)
|
|
|
|
|
|
fn vector_addition(
    lhs_tensor: LayoutTensor[float_dtype, layout, MutAnyOrigin],
    rhs_tensor: LayoutTensor[float_dtype, layout, MutAnyOrigin],
    out_tensor: LayoutTensor[float_dtype, layout, MutAnyOrigin],
    size: Int,
):
    """Write the element-wise sum of two tensors into `out_tensor`.

    Each invocation handles the single element selected by `global_idx.x`.

    Args:
        lhs_tensor: Left-hand operand.
        rhs_tensor: Right-hand operand.
        out_tensor: Destination for the sums.
        size: Number of elements to process; indices at or beyond this
            value are skipped.
    """
    var idx = global_idx.x

    # The launch may cover more threads than there are elements, so
    # out-of-range threads return without touching memory.
    if idx >= UInt(size):
        return

    out_tensor[idx] = lhs_tensor[idx] + rhs_tensor[idx]
|