# ===----------------------------------------------------------------------=== #
# Copyright (c) 2025, Modular Inc. All rights reserved.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions:
# https://llvm.org/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===----------------------------------------------------------------------=== #

# DOC: mojo/docs/manual/layout/tensors.mdx

from layout import (
    IntTuple,
    Layout,
    LayoutTensor,
    print_layout,
    UNKNOWN_VALUE,
    RuntimeLayout,
    RuntimeTuple,
)
from math import ceildiv
from collections import Set, InlineArray

from layout.layout_tensor import LayoutTensorIter, _compute_distribute_layout
from layout.layout import (
    tile_to_shape,
    blocked_product,
    crd2idx,
    idx2crd,
    coalesce,
)
from layout.int_tuple import flatten
from memory import UnsafePointer, memset
from testing import assert_equal
from utils import Index, IndexList


def accessing_tensor_elements_example():
    comptime rows = 4
    comptime columns = 8
    comptime layout = Layout.row_major(rows, columns)
    var storage = InlineArray[Float32, rows * columns](uninitialized=True)
    for i in range(rows * columns):
        storage[i] = i
    var tensor = LayoutTensor[DType.float32, layout](storage)

    var row, col = 0, 1
    # start-access-example-1
    var element = tensor[row, col][
        0
    ]  # element is guaranteed to be a scalar value
    # end-access-example-1

    assert_equal(element, 1)
    row, col = 0, 0

    # start-access-example-2
    var elements = tensor.load[4](row, col)
    elements = elements * 2
    tensor.store(row, col, elements)
    # end-access-example-2

    element = tensor[0, 2][0]
    assert_equal(element, 4)


def accessing_nested_tensor_elements_example():
    comptime rows = 4
    comptime columns = 6
    comptime tiler = Layout.row_major(2, 3)
    comptime layout = blocked_product(Layout.col_major(2, 2), tiler)
    var storage = InlineArray[Float32, rows * columns](uninitialized=True)
    for i in range(rows * columns):
        storage[i] = i
    var tensor = LayoutTensor[DType.float32, layout](storage)

    # start-access-nested-tensor-example
    var element = tensor[1, 0, 0, 1][0]
    # end-access-nested-tensor-example

    assert_equal(element, 5)


def layout_tensor_on_cpu_example():
    # start-layout-tensor-on-cpu-example
    comptime rows = 8
    comptime columns = 16
    comptime layout = Layout.row_major(rows, columns)
    var storage = InlineArray[Float32, rows * columns](uninitialized=True)
    var tensor = LayoutTensor[DType.float32, layout](storage)
    # end-layout-tensor-on-cpu-example
    assert_equal(tensor.size(), rows * columns)
    _ = tensor


def layout_tensor_from_pointer_example():
    # start-layout-tensor-from-pointer-example
    comptime rows = 1024
    comptime columns = 1024
    comptime buf_size = rows * columns
    comptime layout = Layout.row_major(rows, columns)
    var ptr = alloc[Float32](buf_size)
    memset(ptr, 0, buf_size)
    var tensor = LayoutTensor[DType.float32, layout](ptr)
    # end-layout-tensor-from-pointer-example
    assert_equal(tensor.size(), rows * columns)
    _ = tensor


def layout_tensor_tile_example():
    # start-layout-tensor-tile-example
    comptime rows = 2
    comptime columns = 4
    comptime tile_size = 32
    comptime tile_layout = Layout.row_major(tile_size, tile_size)
    comptime tiler_layout = Layout.row_major(rows, columns)
    comptime tiled_layout = blocked_product(tile_layout, tiler_layout)
    var storage = InlineArray[Float32, tiled_layout.size()](uninitialized=True)
    for i in range(tiled_layout.size()):
        storage[i] = i
    var tensor = LayoutTensor[DType.float32, tiled_layout](storage)
    var tile = tensor.tile[32, 32](0, 1)
    # end-layout-tensor-tile-example
    assert_equal(tile[0, 0][0], Float32(tile_size * tile_size))
    # start-layout-tensor-tile-example-2
    var my_tile: tensor.TileType[tile_size, tile_size]
    for i in range(rows):
        for j in range(columns):
            my_tile = tensor.tile[tile_size, tile_size](i, j)
            # ... do something with the tile ...
            # end-layout-tensor-tile-example-2
            _ = my_tile


# Iterates through a block of memory one tile at a time.
# This essentially treats the memory as a flat array of
# tiles (or a 2D row-major matrix of tiles).
def layout_tensor_iterator_example():
    # start-layout-tensor-iterator-example-1
    comptime buf_size = 128
    var storage = InlineArray[Int16, buf_size](uninitialized=True)
    for i in range(buf_size):
        storage[i] = i
    comptime tile_layout = Layout.row_major(4, 4)
    var iter = LayoutTensorIter[DType.int16, tile_layout, MutAnyOrigin](
        storage.unsafe_ptr(), buf_size
    )

    for i in range(ceildiv(buf_size, tile_layout.size())):
        var tile = iter[]
        # ... do something with tile
        iter += 1
        # end-layout-tensor-iterator-example-1
        assert_equal(tile[0, 0][0], i * tile_layout.size())


def layout_tensor_iterator_example2():
    # TODO: set up a tiled layout tensor as input and
    # validate output.
    comptime rows = 4
    comptime cols = 8
    comptime size = rows * cols
    comptime tile_size = 2
    var storage = InlineArray[Int32, size](uninitialized=True)
    for i in range(size):
        storage[i] = Int32(i)

    comptime layout = Layout.row_major(rows, cols)
    var tensor = LayoutTensor[DType.int32, layout, masked=True](storage)
    # start-layout-tensor-iterator-example-2
    # given a tensor of size rows x cols
    comptime num_row_tiles = ceildiv(rows, tile_size)
    comptime num_col_tiles = ceildiv(cols, tile_size)

    for i in range(num_row_tiles):
        var iter = tensor.tiled_iterator[tile_size, tile_size, axis=1](i, 0)

        for _ in range(num_col_tiles):
            var tile = iter[]
            # ... do something with the tile
            iter += 1
            # end-layout-tensor-iterator-example-2
            _ = tile


def main():
    accessing_tensor_elements_example()
    accessing_nested_tensor_elements_example()
    layout_tensor_on_cpu_example()
    layout_tensor_from_pointer_example()
    layout_tensor_tile_example()
    layout_tensor_iterator_example()
    layout_tensor_iterator_example2()