diff --git a/kernel/src/apic.cpp b/kernel/src/apic.cpp
index 10bac9e..3392ffb 100644
--- a/kernel/src/apic.cpp
+++ b/kernel/src/apic.cpp
@@ -1,8 +1,9 @@
-/*
- * MetalOS Kernel - APIC (Advanced Programmable Interrupt Controller)
+/**
+ * @file apic.cpp
+ * @brief Implementation of APIC (Advanced Programmable Interrupt Controller) manager
  * 
- * Local APIC support for multicore systems
- * Replaces legacy PIC for per-CPU interrupt handling
+ * The Local APIC is a key component of modern x86-64 multicore systems. It replaces
+ * the legacy 8259 PIC and provides per-CPU interrupt handling capabilities.
  */
 
 #include "kernel/apic.h"
@@ -10,7 +11,14 @@
 // APIC base address (default, can be read from MSR)
 #define APIC_BASE_MSR 0x1B
 
-// Read CPUID to check for APIC
+/**
+ * @brief Check if CPU has APIC support using CPUID instruction
+ * 
+ * The CPUID instruction provides information about CPU features. Function 1
+ * returns feature flags in the EDX register, where bit 9 indicates APIC support.
+ * 
+ * @return true if APIC is supported, false otherwise
+ */
 static bool cpuidHasAPIC(void) {
     uint32_t eax, ebx, ecx, edx;
     
@@ -25,13 +33,37 @@ static bool cpuidHasAPIC(void) {
     return (edx & (1 << 9)) != 0;
 }
 
-// APIC class implementation
+/* APIC class implementation */
+
+/**
+ * @brief Constructor - sets APIC base address to default memory-mapped location
+ * 
+ * The Local APIC registers are accessed through memory-mapped I/O at physical
+ * address 0xFEE00000 by default. This can be changed via the IA32_APIC_BASE MSR,
+ * but we use the default location for simplicity.
+ */
 APIC::APIC() : apicBase((volatile uint32_t*)0xFEE00000) {}
 
+/**
+ * @brief Read a 32-bit value from an APIC register
+ * 
+ * APIC registers are 32 bits wide and located at 16-byte aligned offsets.
+ * The apicBase pointer is to uint32_t, so we divide the offset by 4 to get
+ * the array index.
+ * 
+ * @param offset Register offset in bytes (e.g., 0x020 for APIC ID register)
+ * @return 32-bit register value
+ */
 uint32_t APIC::read(uint32_t offset) const {
     return apicBase[offset / 4];
 }
 
+/**
+ * @brief Write a 32-bit value to an APIC register
+ * 
+ * @param offset Register offset in bytes
+ * @param value 32-bit value to write
+ */
 void APIC::write(uint32_t offset, uint32_t value) {
     apicBase[offset / 4] = value;
 }
@@ -40,6 +72,17 @@ bool APIC::isAvailable() const {
     return cpuidHasAPIC();
 }
 
+/**
+ * @brief Initialize the Local APIC for this CPU core
+ * 
+ * This function:
+ * 1. Enables the APIC by setting the software enable bit (bit 8) in the
+ *    Spurious Interrupt Vector Register
+ * 2. Sets the spurious vector to 0xFF (unused vector for spurious interrupts)
+ * 3. Sets Task Priority Register to 0 to accept all interrupt priorities
+ * 
+ * After this initialization, the APIC is ready to receive and send interrupts.
+ */
 void APIC::init() {
     // Enable APIC via spurious interrupt vector register
     // Set spurious vector to 0xFF and enable APIC (bit 8)
@@ -49,15 +92,52 @@ void APIC::init() {
     write(APIC_REG_TPR, 0);
 }
 
+/**
+ * @brief Get the APIC ID of the current CPU core
+ * 
+ * The APIC ID is stored in bits 24-31 of the APIC ID register. This is a
+ * unique identifier for each Local APIC (and thus each logical processor).
+ * 
+ * @return 8-bit APIC ID
+ * @note APIC IDs may not be sequential (e.g., 0, 2, 4, 6 on hyperthreaded systems)
+ */
 uint8_t APIC::getId() const {
     uint32_t idReg = read(APIC_REG_ID);
     return (idReg >> 24) & 0xFF;
 }
 
+/**
+ * @brief Send End-Of-Interrupt signal to acknowledge interrupt completion
+ * 
+ * After handling an interrupt that came through the APIC, the interrupt handler
+ * must send an EOI to inform the APIC that the interrupt has been processed.
+ * This allows the APIC to deliver the next interrupt if one is pending.
+ * 
+ * Writing to the EOI register sends the EOI; the value written should be 0.
+ */
 void APIC::sendEOI() {
     write(APIC_REG_EOI, 0);
 }
 
+/**
+ * @brief Send an Inter-Processor Interrupt (IPI) to another CPU core
+ * 
+ * IPIs are used for:
+ * - Starting Application Processors (APs) during SMP initialization (INIT + SIPI)
+ * - Sending signals or notifications to other cores
+ * - TLB shootdowns when changing page tables
+ * - Requesting other cores to perform specific actions
+ * 
+ * The IPI is sent using the Interrupt Command Register (ICR), which consists of
+ * two 32-bit registers (high and low). The high register contains the destination
+ * APIC ID, and the low register contains the delivery mode, vector, and control flags.
+ * 
+ * @param destApicId APIC ID of the destination CPU core
+ * @param vector Interrupt vector number (or page number for SIPI)
+ * @param deliveryMode Delivery mode (INIT, SIPI, fixed, etc.)
+ * 
+ * @note This function waits for any pending IPI to complete before sending a new one
+ */
 void APIC::sendIPI(uint8_t destApicId, uint8_t vector, uint32_t deliveryMode) {
     // Wait for previous IPI to complete
     while (read(APIC_REG_ICR_LOW) & (1 << 12)) {
diff --git a/kernel/src/gdt.cpp b/kernel/src/gdt.cpp
index 6100ca9..fc9cae9 100644
--- a/kernel/src/gdt.cpp
+++ b/kernel/src/gdt.cpp
@@ -1,8 +1,9 @@
-/*
- * MetalOS Kernel - Global Descriptor Table (GDT)
+/**
+ * @file gdt.cpp
+ * @brief Implementation of Global Descriptor Table (GDT) manager
  * 
- * Minimal GDT setup for x86_64 long mode
- * Only what's needed for our single-app OS
+ * The GDT is required by x86-64 processors even though segmentation is largely
+ * disabled in 64-bit mode. It defines code and data segments with privilege levels.
  */
 
 #include "kernel/gdt.h"
@@ -10,12 +11,40 @@
 // Load GDT (assembly)
 extern "C" void gdt_flush(uint64_t);
 
-// GDT class implementation
+/* GDT class implementation */
+
+/**
+ * @brief Constructor - initializes GDT pointer structure
+ * 
+ * Sets up the GDTR (GDT Register) structure that will be loaded into the CPU.
+ * The limit is the size of the GDT minus 1, and the base is the memory address.
+ */
 GDT::GDT() {
     gdtPtr.limit = (sizeof(gdt_entry_t) * 5) - 1;
     gdtPtr.base = (uint64_t)&entries;
 }
 
+/**
+ * @brief Set a GDT entry with specified parameters
+ * 
+ * Fills in all fields of a GDT entry. In 64-bit mode, the base and limit are
+ * largely ignored, but the access flags (privilege level, executable) are enforced.
+ * 
+ * @param num Entry index (0-4)
+ * @param base Base address (mostly ignored in 64-bit mode)
+ * @param limit Segment limit (mostly ignored in 64-bit mode)
+ * @param access Access byte containing:
+ *               - Bit 7: Present (must be 1 for valid segment)
+ *               - Bits 5-6: DPL (Descriptor Privilege Level): 0=kernel, 3=user
+ *               - Bit 4: Descriptor type (1 for code/data)
+ *               - Bit 3: Executable (1 for code, 0 for data)
+ *               - Bit 1: Readable/Writable
+ * @param gran Granularity byte containing:
+ *             - Bit 7: Granularity (1=4KB blocks, 0=1 byte blocks)
+ *             - Bit 6: Size (1=32-bit, 0=16-bit; must be 0 when bit 5 is set)
+ *             - Bit 5: Long mode (1=64-bit code segment)
+ *             - Bits 0-3: Upper 4 bits of limit
+ */
 void GDT::setGate(int num, uint32_t base, uint32_t limit, uint8_t access, uint8_t gran) {
     entries[num].base_low = (base & 0xFFFF);
     entries[num].base_middle = (base >> 16) & 0xFF;
@@ -27,6 +56,30 @@ void GDT::setGate(int num, uint32_t base, uint32_t limit, uint8_t access, uint8_
     entries[num].access = access;
 }
 
+/**
+ * @brief Initialize the GDT with required segments and load it
+ * 
+ * Sets up a minimal GDT with 5 entries:
+ * 0. Null descriptor (required by CPU, must be all zeros)
+ * 1. Kernel code segment (DPL 0, 64-bit, executable, readable)
+ * 2. Kernel data segment (DPL 0, writable)
+ * 3. User code segment (DPL 3, 64-bit, executable, readable)
+ * 4. User data segment (DPL 3, writable)
+ * 
+ * After setting up the entries, calls gdt_flush() assembly function to:
+ * - Load GDTR using LGDT instruction
+ * - Reload segment registers with new selectors
+ * 
+ * Access byte values:
+ * - 0x9A = 10011010 = Present, Ring 0, Code, Executable, Readable
+ * - 0x92 = 10010010 = Present, Ring 0, Data, Writable
+ * - 0xFA = 11111010 = Present, Ring 3, Code, Executable, Readable
+ * - 0xF2 = 11110010 = Present, Ring 3, Data, Writable
+ * 
+ * Granularity values:
+ * - 0xA0 = 10100000 = Long mode (64-bit)
+ * - 0xC0 = 11000000 = 32-bit mode (for data segments in 64-bit mode)
+ */
 void GDT::init() {
     // Null descriptor
     setGate(0, 0, 0, 0, 0);
diff --git a/kernel/src/interrupts.cpp b/kernel/src/interrupts.cpp
index b2fc199..c392dd0 100644
--- a/kernel/src/interrupts.cpp
+++ b/kernel/src/interrupts.cpp
@@ -1,8 +1,9 @@
-/*
- * MetalOS Kernel - Interrupt Handling
+/**
+ * @file interrupts.cpp
+ * @brief Implementation of interrupt descriptor table and interrupt handling
  * 
- * Minimal IDT and interrupt handlers
- * Supports both PIC (legacy) and APIC (multicore) modes
+ * Manages CPU exceptions and hardware interrupts through the IDT.
+ * Supports both legacy PIC and modern APIC interrupt controllers.
  */
 
 #include "kernel/interrupts.h"
@@ -10,7 +11,11 @@
 #include "kernel/smp.h"
 #include "kernel/apic.h"
 
-// I/O port access functions
+/**
+ * @brief Write a byte to an I/O port
+ * @param port I/O port address
+ * @param value Byte value to write
+ */
 static inline void outb(uint16_t port, uint8_t value) {
     __asm__ volatile("outb %0, %1" : : "a"(value), "Nd"(port));
 }
@@ -34,12 +39,34 @@ extern "C" {
     void irq0(void); void irq1(void);
 }
 
-// InterruptManager class implementation
+/* InterruptManager class implementation */
+
+/**
+ * @brief Constructor - initializes IDT pointer structure
+ */
 InterruptManager::InterruptManager() {
     idtPtr.limit = (sizeof(idt_entry_t) * 256) - 1;
     idtPtr.base = (uint64_t)&idt;
 }
 
+/**
+ * @brief Set an IDT entry to point to an interrupt handler
+ * 
+ * In 64-bit mode, IDT entries are 16 bytes and contain:
+ * - 64-bit handler address (split across three fields)
+ * - 16-bit code segment selector
+ * - Type and attributes (present, DPL, gate type)
+ * - IST (Interrupt Stack Table) offset (usually 0)
+ * 
+ * @param num Interrupt vector number (0-255)
+ * @param handler Address of interrupt handler function
+ * @param selector Code segment selector (0x08 for kernel code)
+ * @param flags Type and attribute byte:
+ *              - Bit 7: Present (1)
+ *              - Bits 5-6: DPL (0 for kernel)
+ *              - Bits 0-3: Gate type (0xE for interrupt gate); bit 4 must be 0
+ *              Common value: 0x8E = Present, DPL=0, Interrupt Gate
+ */
 void InterruptManager::setGate(uint8_t num, uint64_t handler, uint16_t selector, uint8_t flags) {
     idt[num].offset_low = handler & 0xFFFF;
     idt[num].offset_mid = (handler >> 16) & 0xFFFF;
@@ -50,6 +77,26 @@ void InterruptManager::setGate(uint8_t num, uint64_t handler, uint16_t selector,
     idt[num].zero = 0;
 }
 
+/**
+ * @brief Remap the 8259 PIC to avoid conflicts with CPU exceptions
+ * 
+ * In the legacy PC configuration, the master PIC delivers IRQ 0-7 on vectors
+ * 8-15, which overlap with CPU exception vectors 0-31. A hardware interrupt then
+ * shares a vector with a CPU exception (e.g., IRQ 0 timer vs Double Fault, vector 8).
+ * 
+ * We remap the PIC so that:
+ * - Master PIC (IRQ 0-7) → vectors 32-39
+ * - Slave PIC (IRQ 8-15) → vectors 40-47
+ * 
+ * The remapping process uses ICW (Initialization Command Words):
+ * - ICW1: Start initialization (0x11 = ICW4 needed, cascade mode)
+ * - ICW2: Set vector offset (0x20 for master, 0x28 for slave)
+ * - ICW3: Set up cascade (master: slave on IRQ2, slave: cascade identity)
+ * - ICW4: Set 8086 mode
+ * 
+ * After remapping, all IRQs are masked (disabled) initially. Individual IRQs
+ * must be explicitly unmasked to receive interrupts.
+ */
 void InterruptManager::remapPIC() {
     // ICW1: Initialize PIC
     outb(PIC1_COMMAND, 0x11);
@@ -72,6 +119,30 @@ void InterruptManager::remapPIC() {
     outb(PIC2_DATA, 0xFF);
 }
 
+/**
+ * @brief Initialize the IDT and enable interrupts
+ * 
+ * This function performs complete interrupt subsystem initialization:
+ * 1. Clear all 256 IDT entries
+ * 2. Install exception handlers (ISR 0-31) for CPU exceptions
+ * 3. Remap the PIC to avoid conflicts
+ * 4. Install IRQ handlers (32-47) for hardware interrupts
+ * 5. Load IDT using LIDT instruction
+ * 6. Enable interrupts using STI instruction
+ * 
+ * CPU exceptions (0-31) include:
+ * - 0: Divide by zero
+ * - 6: Invalid opcode
+ * - 13: General protection fault
+ * - 14: Page fault
+ * etc.
+ * 
+ * Hardware IRQs (32-47) include:
+ * - 32 (IRQ 0): Timer
+ * - 33 (IRQ 1): Keyboard
+ * - 44 (IRQ 12): PS/2 Mouse
+ * etc.
+ */
 void InterruptManager::init() {
     // Clear IDT
     for (int i = 0; i < 256; i++) {
@@ -126,6 +197,25 @@ void InterruptManager::init() {
     __asm__ volatile("sti");
 }
 
+/**
+ * @brief Main interrupt handler dispatcher
+ * 
+ * This function is called from the assembly interrupt stubs (ISRs/IRQs).
+ * It receives the saved CPU state and dispatches to specific handlers
+ * based on the interrupt number.
+ * 
+ * Process:
+ * 1. Check interrupt number
+ * 2. Call specific handler if needed (e.g., timer for IRQ 0)
+ * 3. Send End-Of-Interrupt signal to PIC or APIC
+ * 
+ * For hardware IRQs (32-47):
+ * - Check if using APIC (multicore) or PIC (legacy)
+ * - Send EOI to appropriate controller
+ * - For slave PIC IRQs (40-47), must send EOI to both PICs
+ * 
+ * @param regs Pointer to saved CPU register state
+ */
 void InterruptManager::handleInterrupt(registers_t* regs) {
     // Handle specific interrupts
     if (regs->int_no == 32) {
diff --git a/kernel/src/main.cpp b/kernel/src/main.cpp
index cf7e071..d630a93 100644
--- a/kernel/src/main.cpp
+++ b/kernel/src/main.cpp
@@ -1,3 +1,26 @@
+/**
+ * @file main.cpp
+ * @brief MetalOS Kernel Main Entry Point
+ * 
+ * This is the heart of MetalOS - an extremely minimalist kernel designed to run
+ * a single application (Qt 6 Hello World). The kernel provides only the essential
+ * hardware initialization needed to run the application.
+ * 
+ * Design Philosophy:
+ * - No scheduler: Single application, always running
+ * - No process management: One process only
+ * - No complex memory management: Simple bump allocator
+ * - No filesystem: Application embedded in boot image
+ * - Multicore support: All cores initialized for future parallel processing
+ * 
+ * Boot sequence:
+ * 1. UEFI bootloader loads kernel and provides boot information
+ * 2. Kernel initializes hardware (GDT, IDT, memory, timer, PCI, SMP)
+ * 3. Kernel will eventually jump directly to the application
+ * 
+ * Target size: < 150 KB (achieved through extreme minimalism)
+ */
+
 /*
  * MetalOS Kernel - Main Entry Point
  * 
@@ -16,12 +39,60 @@
 #include "kernel/timer.h"
 #include "kernel/smp.h"
 
-/*
- * Kernel main entry point
- * Called by bootloader with boot information
+/**
+ * @brief Kernel main entry point - called by bootloader
  * 
- * This is the root-level function that hands off to C++ classes
- * for hardware initialization and system management.
+ * This is the first C++ function executed after the bootloader transfers control.
+ * It receives boot information from UEFI and performs minimal hardware initialization.
+ * 
+ * Initialization sequence:
+ * 
+ * 1. GDT (Global Descriptor Table):
+ *    - Required for x86-64 segmentation and privilege levels
+ *    - Sets up kernel/user code and data segments
+ * 
+ * 2. IDT (Interrupt Descriptor Table):
+ *    - Sets up interrupt and exception handlers
+ *    - Remaps PIC to avoid conflicts with CPU exceptions
+ *    - Enables hardware interrupts
+ * 
+ * 3. Physical Memory Manager:
+ *    - Initializes page bitmap for 4KB page allocation
+ *    - Currently assumes 128MB at 16MB physical address
+ *    - TODO: Parse UEFI memory map for proper detection
+ * 
+ * 4. Kernel Heap:
+ *    - Allocates 1MB (256 pages) for kernel dynamic allocation
+ *    - Uses simple bump allocator (no free() support)
+ * 
+ * 5. Timer (PIT):
+ *    - Programs 8254 PIT for 1000 Hz (1ms ticks)
+ *    - Used for timekeeping and delays
+ * 
+ * 6. PCI Bus:
+ *    - Enumerates all PCI devices
+ *    - Discovers GPU and other hardware
+ *    - Stores device information for later use
+ * 
+ * 7. SMP (Symmetric Multi-Processing):
+ *    - Initializes Local APIC on BSP
+ *    - Starts all available Application Processor cores
+ *    - Currently APs idle; only BSP runs application
+ * 
+ * After initialization, the kernel will eventually:
+ * - Initialize GPU for framebuffer graphics
+ * - Set up minimal input (PS/2 keyboard/mouse or USB)
+ * - Jump directly to the Qt 6 application entry point
+ * 
+ * For now, it enters an infinite halt loop waiting for implementation.
+ * 
+ * @param boot_info Pointer to boot information structure from UEFI bootloader containing:
+ *                  - Framebuffer information (base, width, height, pitch, bpp)
+ *                  - Kernel location and size
+ *                  - ACPI RSDP pointer
+ *                  - UEFI memory map
+ * 
+ * @note This function should never return
  */
 extern "C" void kernel_main(BootInfo* boot_info) {
     // Initialize GDT (Global Descriptor Table) - using GDT class
diff --git a/kernel/src/memory.cpp b/kernel/src/memory.cpp
index e500827..89ff90f 100644
--- a/kernel/src/memory.cpp
+++ b/kernel/src/memory.cpp
@@ -1,8 +1,10 @@
-/*
- * MetalOS Kernel - Memory Management
+/**
+ * @file memory.cpp
+ * @brief Implementation of physical memory manager and kernel heap allocator
  * 
- * Simple physical memory manager and heap allocator
- * Minimal implementation for single-app OS
+ * Provides two memory management subsystems:
+ * 1. Physical Memory Manager (PMM): Manages 4KB pages using a bitmap
+ * 2. Heap Allocator: Simple bump allocator for kernel dynamic allocation
  */
 
 #include "kernel/memory.h"
@@ -10,7 +12,11 @@
 // Physical memory bitmap constants
 #define BITMAP_SIZE 32768  // Supports up to 128MB with 4KB pages
 
-// PhysicalMemoryManager class implementation
+/* PhysicalMemoryManager class implementation */
+
+/**
+ * @brief Constructor - initializes all fields and clears bitmap
+ */
 PhysicalMemoryManager::PhysicalMemoryManager() 
     : totalPages(0), usedPages(0) {
     for (uint64_t i = 0; i < BITMAP_SIZE; i++) {
@@ -18,6 +24,20 @@ PhysicalMemoryManager::PhysicalMemoryManager()
     }
 }
 
+/**
+ * @brief Initialize the physical memory manager
+ * 
+ * Currently uses a simplified approach:
+ * - Assumes 128MB of usable RAM starting at physical address 16MB (0x01000000)
+ * - Clears the entire page bitmap to mark all pages as free
+ * - TODO: Parse the UEFI memory map from bootInfo to properly detect available memory
+ * 
+ * The 16MB starting address is chosen to avoid:
+ * - First 1MB: Legacy BIOS area, video memory, etc.
+ * - 1MB-16MB: Kernel code, boot structures, and reserved areas
+ * 
+ * @param bootInfo Boot information structure (currently unused, TODO: parse memory map)
+ */
 void PhysicalMemoryManager::init(BootInfo* bootInfo) {
     (void)bootInfo;  // TODO: Parse UEFI memory map
     
@@ -32,6 +52,21 @@ void PhysicalMemoryManager::init(BootInfo* bootInfo) {
     usedPages = 0;
 }
 
+/**
+ * @brief Allocate a single 4KB physical memory page
+ * 
+ * Uses a simple first-fit algorithm:
+ * 1. Scan the bitmap from the beginning
+ * 2. Find the first page where the corresponding bit is 0 (free)
+ * 3. Set the bit to 1 (allocated)
+ * 4. Calculate and return the physical address
+ * 
+ * Each bit in the bitmap represents one 4KB page:
+ * - Byte N, Bit M represents page (N*8 + M)
+ * - Physical address = 0x01000000 + (page_index * 4096)
+ * 
+ * @return Physical address of allocated page, or nullptr if out of memory
+ */
 void* PhysicalMemoryManager::allocPage() {
     // Find first free page in bitmap
     for (uint64_t i = 0; i < totalPages; i++) {
@@ -53,6 +88,15 @@ void* PhysicalMemoryManager::allocPage() {
     return nullptr;
 }
 
+/**
+ * @brief Free a previously allocated physical memory page
+ * 
+ * Calculates the page index from the physical address and clears the
+ * corresponding bit in the bitmap to mark the page as free.
+ * 
+ * @param page Physical address of page to free
+ * @note Does nothing if address is invalid (< base or >= limit)
+ */
 void PhysicalMemoryManager::freePage(void* page) {
     uint64_t addr = (uint64_t)page;
     
@@ -71,24 +115,65 @@ void PhysicalMemoryManager::freePage(void* page) {
     usedPages--;
 }
 
+/**
+ * @brief Get total memory managed by PMM in bytes
+ * @return Total memory size (totalPages * PAGE_SIZE)
+ */
 uint64_t PhysicalMemoryManager::getTotalMemory() const {
     return totalPages * PAGE_SIZE;
 }
 
+/**
+ * @brief Get free memory available in bytes
+ * @return Free memory size ((totalPages - usedPages) * PAGE_SIZE)
+ */
 uint64_t PhysicalMemoryManager::getFreeMemory() const {
     return (totalPages - usedPages) * PAGE_SIZE;
 }
 
-// HeapAllocator class implementation
+/* HeapAllocator class implementation */
+
+/**
+ * @brief Constructor - initializes all pointers to null
+ */
 HeapAllocator::HeapAllocator() 
     : heapStart(nullptr), heapCurrent(nullptr), heapEnd(nullptr) {}
 
+/**
+ * @brief Initialize heap with a pre-allocated memory region
+ * 
+ * The heap operates on a contiguous region of memory. The heapCurrent pointer
+ * starts at the beginning and moves forward with each allocation.
+ * 
+ * @param start Starting address of heap region (obtained from PMM)
+ * @param size Size of heap region in bytes (e.g., 1MB = 256 pages * 4KB)
+ */
 void HeapAllocator::init(void* start, size_t size) {
     heapStart = (uint8_t*)start;
     heapCurrent = heapStart;
     heapEnd = heapStart + size;
 }
 
+/**
+ * @brief Allocate memory from the heap (bump allocator)
+ * 
+ * This is a "bump" or "arena" allocator - the simplest possible allocator.
+ * It just moves the current pointer forward by the requested size.
+ * 
+ * Process:
+ * 1. Round size up to 16-byte boundary for alignment
+ * 2. Check if enough space remains in heap
+ * 3. Save current pointer as return value
+ * 4. Move current pointer forward by aligned size
+ * 
+ * Alignment to 16 bytes ensures:
+ * - Compatibility with aligned SSE loads/stores (which require 16-byte alignment)
+ * - Better cache line utilization
+ * - Prevents unaligned access penalties
+ * 
+ * @param size Number of bytes to allocate
+ * @return Pointer to allocated memory, or nullptr if out of heap space
+ */
 void* HeapAllocator::alloc(size_t size) {
     if (!heapStart) {
         return nullptr;
@@ -107,6 +192,15 @@ void* HeapAllocator::alloc(size_t size) {
     return ptr;
 }
 
+/**
+ * @brief Allocate and zero-initialize array memory
+ * 
+ * Equivalent to alloc(num * size) followed by memset to zero; num * size is not overflow-checked.
+ * 
+ * @param num Number of elements
+ * @param size Size of each element in bytes
+ * @return Pointer to allocated and zeroed memory, or nullptr if out of space
+ */
 void* HeapAllocator::calloc(size_t num, size_t size) {
     size_t total = num * size;
     void* ptr = alloc(total);
@@ -118,13 +212,35 @@ void* HeapAllocator::calloc(size_t num, size_t size) {
     return ptr;
 }
 
+/**
+ * @brief Free memory (no-op in bump allocator)
+ * 
+ * Bump allocators cannot free individual allocations. The entire heap
+ * can only be reset at once. For a simple single-application OS, this
+ * limitation is acceptable.
+ * 
+ * @param ptr Pointer to memory (ignored)
+ * @todo Replace with proper allocator if individual free() is needed
+ */
 void HeapAllocator::free(void* ptr) {
     (void)ptr;
     // TODO: Implement proper free with a real allocator
     // For now, bump allocator doesn't support freeing
 }
 
-// Memory utility functions
+/* Memory utility functions */
+
+/**
+ * @brief Fill memory with a constant byte value
+ * 
+ * Simple byte-by-byte memset implementation. Not optimized for large blocks,
+ * but sufficient for kernel use with small structures and buffers.
+ * 
+ * @param dest Pointer to memory block to fill
+ * @param val Value to set (converted to unsigned char)
+ * @param count Number of bytes to set
+ * @return Pointer to dest
+ */
 void* memset(void* dest, int val, size_t count) {
     uint8_t* d = (uint8_t*)dest;
     uint8_t v = (uint8_t)val;
@@ -136,6 +252,17 @@ void* memset(void* dest, int val, size_t count) {
     return dest;
 }
 
+/**
+ * @brief Copy memory from source to destination
+ * 
+ * Simple byte-by-byte memcpy implementation. Memory areas must not overlap.
+ * 
+ * @param dest Pointer to destination buffer
+ * @param src Pointer to source buffer
+ * @param count Number of bytes to copy
+ * @return Pointer to dest
+ * @warning Memory regions must not overlap (use memmove if they might)
+ */
 void* memcpy(void* dest, const void* src, size_t count) {
     uint8_t* d = (uint8_t*)dest;
     const uint8_t* s = (const uint8_t*)src;
@@ -147,6 +274,17 @@ void* memcpy(void* dest, const void* src, size_t count) {
     return dest;
 }
 
+/**
+ * @brief Compare two memory blocks
+ * 
+ * Compares memory byte-by-byte until a difference is found or count bytes
+ * have been compared.
+ * 
+ * @param s1 Pointer to first memory block
+ * @param s2 Pointer to second memory block
+ * @param count Number of bytes to compare
+ * @return 0 if equal, negative if s1 < s2, positive if s1 > s2
+ */
 int memcmp(const void* s1, const void* s2, size_t count) {
     const uint8_t* a = (const uint8_t*)s1;
     const uint8_t* b = (const uint8_t*)s2;
diff --git a/kernel/src/pci.cpp b/kernel/src/pci.cpp
index 3f96bcc..6c38a2a 100644
--- a/kernel/src/pci.cpp
+++ b/kernel/src/pci.cpp
@@ -1,27 +1,69 @@
-/*
- * MetalOS Kernel - PCI Bus Support
+/**
+ * @file pci.cpp
+ * @brief Implementation of PCI bus enumeration and device management
  * 
- * Minimal PCI enumeration and configuration
- * Only what's needed to find and initialize the GPU
+ * PCI (Peripheral Component Interconnect) is the standard bus for connecting
+ * hardware devices. This implementation scans the PCI bus to discover devices
+ * and provides functions to configure them.
  */
 
 #include "kernel/pci.h"
 #include "kernel/memory.h"
 
-// I/O port access functions
+/**
+ * @brief Write a 32-bit value to an I/O port
+ * @param port I/O port address
+ * @param value 32-bit value to write
+ */
 static inline void outl(uint16_t port, uint32_t value) {
     __asm__ volatile("outl %0, %1" : : "a"(value), "Nd"(port));
 }
 
+/**
+ * @brief Read a 32-bit value from an I/O port
+ * @param port I/O port address
+ * @return 32-bit value read from port
+ */
 static inline uint32_t inl(uint16_t port) {
     uint32_t value;
     __asm__ volatile("inl %1, %0" : "=a"(value) : "Nd"(port));
     return value;
 }
 
-// PCIManager class implementation
+/* PCIManager class implementation */
+
+/**
+ * @brief Constructor - initializes device count to zero
+ */
 PCIManager::PCIManager() : deviceCount(0) {}
 
+/**
+ * @brief Read a 32-bit value from PCI configuration space
+ * 
+ * PCI configuration space is accessed through two I/O ports:
+ * - 0xCF8 (CONFIG_ADDRESS): Write the address of config register to read
+ * - 0xCFC (CONFIG_DATA): Read the 32-bit value from that register
+ * 
+ * The address format (32 bits):
+ * - Bit 31: Enable bit (must be 1)
+ * - Bits 16-23: Bus number (0-255)
+ * - Bits 11-15: Device number (0-31)
+ * - Bits 8-10: Function number (0-7)
+ * - Bits 0-7: Register offset (4-byte aligned, bits 0-1 ignored)
+ * 
+ * Each PCI function has 256 bytes of configuration space containing:
+ * - Device identification (vendor/device ID at offset 0x00)
+ * - Command/status registers (offset 0x04)
+ * - Class code (offset 0x08)
+ * - BARs (Base Address Registers at offsets 0x10-0x24)
+ * - Interrupt configuration
+ * 
+ * @param bus Bus number (0-255)
+ * @param device Device number on bus (0-31)
+ * @param function Function number within device (0-7)
+ * @param offset Register offset (must be 4-byte aligned)
+ * @return 32-bit configuration register value
+ */
 uint32_t PCIManager::readConfig(uint8_t bus, uint8_t device, uint8_t function, uint8_t offset) {
     uint32_t address = (uint32_t)(
         ((uint32_t)bus << 16) |
@@ -35,6 +77,21 @@ uint32_t PCIManager::readConfig(uint8_t bus, uint8_t device, uint8_t function, u
     return inl(PCI_CONFIG_DATA);
 }
 
+/**
+ * @brief Write a 32-bit value to PCI configuration space
+ * 
+ * Similar to readConfig, but writes a value to the specified register.
+ * Used for device configuration, such as:
+ * - Enabling bus mastering
+ * - Enabling memory/IO space access
+ * - Configuring interrupt lines
+ * 
+ * @param bus Bus number
+ * @param device Device number
+ * @param function Function number
+ * @param offset Register offset (4-byte aligned)
+ * @param value 32-bit value to write
+ */
 void PCIManager::writeConfig(uint8_t bus, uint8_t device, uint8_t function, uint8_t offset, uint32_t value) {
     uint32_t address = (uint32_t)(
         ((uint32_t)bus << 16) |
@@ -48,6 +105,26 @@ void PCIManager::writeConfig(uint8_t bus, uint8_t device, uint8_t function, uint
     outl(PCI_CONFIG_DATA, value);
 }
 
+/**
+ * @brief Probe a specific PCI device/function and add to device list
+ * 
+ * Reads device information from PCI configuration space and stores it:
+ * - Vendor ID and Device ID (for identification)
+ * - Class code, subclass, prog_if (device type)
+ * - Revision ID
+ * - All 6 Base Address Registers (BARs)
+ * 
+ * BARs specify memory or I/O regions used by the device:
+ * - Bit 0: 0=memory BAR, 1=I/O BAR
+ * - For memory BARs:
+ *   - Bits 1-2: Type (00=32-bit, 10=64-bit)
+ *   - Bit 3: Prefetchable
+ *   - Bits 4-31: Base address (16-byte aligned; actual alignment equals BAR size)
+ * 
+ * @param bus Bus number
+ * @param device Device number
+ * @param function Function number
+ */
 void PCIManager::probeDevice(uint8_t bus, uint8_t device, uint8_t function) {
     uint32_t vendorDevice = readConfig(bus, device, function, 0x00);
     uint16_t vendor_id = vendorDevice & 0xFFFF;
@@ -85,6 +162,21 @@ void PCIManager::probeDevice(uint8_t bus, uint8_t device, uint8_t function) {
     }
 }
 
+/**
+ * @brief Initialize PCI subsystem by scanning all buses
+ * 
+ * Performs a complete scan of the PCI bus hierarchy:
+ * - Iterates through all 256 possible buses
+ * - For each bus, checks all 32 device slots
+ * - For each device, checks if it's multi-function
+ * - If multi-function, scans all 8 functions
+ * 
+ * A device exists if its vendor ID is not 0xFFFF. The header type byte (offset
+ * 0x0E, in the dword read at 0x0C) has bit 7 set for multi-function devices.
+ * 
+ * This approach is brute-force but simple and reliable. More sophisticated
+ * implementations would parse ACPI tables to find PCI buses.
+ */
 void PCIManager::init() {
     deviceCount = 0;
     
@@ -114,6 +206,18 @@ void PCIManager::init() {
     }
 }
 
+/**
+ * @brief Find a PCI device by vendor and device ID
+ * 
+ * Searches through the list of discovered devices for a match.
+ * Useful for finding specific hardware, e.g.:
+ * - AMD RX 6600: vendor=0x1002, device=0x73FF
+ * - Intel NIC: vendor=0x8086, device=various
+ * 
+ * @param vendor_id Vendor identifier (e.g., 0x1002 for AMD)
+ * @param device_id Device identifier (specific model)
+ * @return Pointer to PCIDevice if found, nullptr otherwise
+ */
 PCIDevice* PCIManager::findDevice(uint16_t vendor_id, uint16_t device_id) {
     for (uint32_t i = 0; i < deviceCount; i++) {
         if (devices[i].vendor_id == vendor_id && devices[i].device_id == device_id) {
@@ -123,6 +227,21 @@ PCIDevice* PCIManager::findDevice(uint16_t vendor_id, uint16_t device_id) {
     return nullptr;
 }
 
+/**
+ * @brief Enable bus mastering for a PCI device
+ * 
+ * Bus mastering allows a device to perform DMA (Direct Memory Access) -
+ * reading and writing system memory without CPU involvement. This is
+ * essential for high-performance devices like GPUs and network cards.
+ * 
+ * The command register (offset 0x04) contains control bits:
+ * - Bit 0: I/O Space Enable
+ * - Bit 1: Memory Space Enable
+ * - Bit 2: Bus Master Enable ← We set this bit
+ * - Bit 10: Interrupt Disable
+ * 
+ * @param dev Pointer to PCI device structure
+ */
 void PCIManager::enableBusMastering(PCIDevice* dev) {
     if (!dev) return;
     
diff --git a/kernel/src/smp.cpp b/kernel/src/smp.cpp
index f3e9e40..5c76bb8 100644
--- a/kernel/src/smp.cpp
+++ b/kernel/src/smp.cpp
@@ -1,8 +1,9 @@
-/*
- * MetalOS Kernel - SMP (Symmetric Multi-Processing) Support
+/**
+ * @file smp.cpp
+ * @brief Implementation of SMP (Symmetric Multi-Processing) initialization
  * 
- * Basic multicore support for better performance
- * Initializes Application Processors (APs) using SIPI protocol
+ * SMP support allows the OS to use multiple CPU cores. This involves starting
+ * Application Processors (APs) using the INIT-SIPI-SIPI sequence defined by Intel.
  */
 
 #include "kernel/smp.h"
@@ -21,7 +22,14 @@ extern "C" {
     void ap_trampoline_end(void);
 }
 
-// SMPManager class implementation
+/* SMPManager class implementation */
+
+/**
+ * @brief Constructor - initializes BSP (Bootstrap Processor) as CPU 0
+ * 
+ * The BSP is the first CPU core that starts when the system boots.
+ * It's responsible for initializing the system and starting other cores (APs).
+ */
 SMPManager::SMPManager() : cpuCount(1), smpEnabled(false) {
     // Initialize BSP
     cpuInfo[0].cpu_id = BSP_CPU_ID;
@@ -30,6 +38,15 @@ SMPManager::SMPManager() : cpuCount(1), smpEnabled(false) {
     cpuInfo[0].kernel_stack = 0;
 }
 
+/**
+ * @brief Get the logical CPU ID of the currently executing core
+ * 
+ * Uses the Local APIC ID to determine which CPU is running this code.
+ * This is important in multicore systems where each core may be executing
+ * kernel code simultaneously.
+ * 
+ * @return Logical CPU ID (0 for BSP, 1+ for APs)
+ */
 uint8_t SMPManager::getCurrentCPU() const {
     if (!smpEnabled) {
         return BSP_CPU_ID;
@@ -47,6 +64,16 @@ uint8_t SMPManager::getCurrentCPU() const {
     return BSP_CPU_ID;
 }
 
+/**
+ * @brief Initialize CPU information structure
+ * 
+ * Sets up the per-CPU data structure with initial values.
+ * Each CPU has a logical ID (sequential: 0, 1, 2...) and a physical
+ * APIC ID (may not be sequential, e.g., 0, 2, 4, 6...).
+ * 
+ * @param cpuId Logical CPU ID (0-15; values >= MAX_CPUS are ignored)
+ * @param apicId Physical APIC ID
+ */
 void SMPManager::initCPU(uint8_t cpuId, uint8_t apicId) {
     if (cpuId >= MAX_CPUS) return;
     
@@ -62,6 +89,17 @@ void SMPManager::markCPUOnline(uint8_t cpuId) {
     }
 }
 
+/**
+ * @brief Busy-wait delay for timing during AP startup
+ * 
+ * This is an approximate delay using a busy loop. Not precise, but sufficient
+ * for the timing requirements of the INIT-SIPI-SIPI sequence:
+ * - 10ms delay after INIT
+ * - 200μs delay after each SIPI
+ * 
+ * @param microseconds Delay duration in microseconds (approximate)
+ * @note Uses PAUSE instruction to improve performance during busy-wait
+ */
 void SMPManager::delay(uint32_t microseconds) {
     // Approximate delay (not precise)
     for (volatile uint32_t i = 0; i < microseconds * 100; i++) {
@@ -69,6 +107,30 @@ void SMPManager::delay(uint32_t microseconds) {
     }
 }
 
+/**
+ * @brief Start an Application Processor using INIT-SIPI-SIPI sequence
+ * 
+ * The Intel-specified AP startup sequence:
+ * 1. Send INIT IPI to reset the AP to a known state (16-bit real mode)
+ * 2. Wait 10ms for INIT to complete
+ * 3. Send first SIPI with vector = page number of trampoline code
+ * 4. Wait 200μs
+ * 5. Send second SIPI (per Intel spec for compatibility)
+ * 6. Wait 200μs
+ * 7. Poll for AP to mark itself online (timeout after 1 second)
+ * 
+ * The SIPI vector is the page number (4KB) where the trampoline code
+ * is located. For address 0x8000, vector = 0x8000 >> 12 = 0x08.
+ * 
+ * The trampoline code must:
+ * - Be in low memory (< 1MB) accessible in real mode
+ * - Switch from 16-bit real mode to 64-bit long mode
+ * - Initialize the AP's GDT, IDT, and APIC
+ * - Jump to the AP entry point in the kernel
+ * 
+ * @param apicId Physical APIC ID of the AP to start
+ * @return true if AP started successfully, false on timeout
+ */
 bool SMPManager::startAP(uint8_t apicId) {
     // Send INIT IPI
     apic_send_ipi(apicId, 0, APIC_IPI_INIT);
@@ -97,6 +159,28 @@ bool SMPManager::startAP(uint8_t apicId) {
     return false;
 }
 
+/**
+ * @brief Initialize SMP subsystem and start all available CPU cores
+ * 
+ * This function performs the following steps:
+ * 1. Check if Local APIC is available (required for SMP)
+ * 2. If no APIC, fall back to single-core mode
+ * 3. Initialize BSP's Local APIC
+ * 4. Get BSP's APIC ID
+ * 5. Attempt to start additional cores by sending IPIs
+ * 6. Set smpEnabled flag if multiple cores detected
+ * 
+ * The function tries to start up to MAX_CPUS cores by probing APIC IDs
+ * from 0 to maxCPUsToTry. In a real system, this should be done by
+ * parsing the ACPI MADT (Multiple APIC Description Table) to find
+ * the actual APIC IDs of installed CPUs.
+ * 
+ * After successful initialization:
+ * - All cores have initialized their Local APICs
+ * - All cores are marked as online
+ * - Each core can execute kernel code
+ * - Currently only BSP runs the application (APs idle)
+ */
 void SMPManager::init() {
     // Check if APIC is available
     if (!apic_is_available()) {
diff --git a/kernel/src/spinlock.cpp b/kernel/src/spinlock.cpp
index da8817a..8428140 100644
--- a/kernel/src/spinlock.cpp
+++ b/kernel/src/spinlock.cpp
@@ -1,20 +1,42 @@
-/*
- * MetalOS Kernel - Spinlock
+/**
+ * @file spinlock.cpp
+ * @brief Implementation of spinlock synchronization primitive
  * 
- * Simple spinlock implementation for multicore synchronization
- * Uses x86 atomic instructions
+ * Spinlocks provide mutual exclusion in multicore systems using atomic operations.
  */
 
 #include "kernel/spinlock.h"
 
-// Spinlock class implementation
+/* Spinlock class implementation */
 
+/**
+ * @brief Constructor - initializes lock to unlocked state (0)
+ */
 Spinlock::Spinlock() : lock(0) {}
 
 void Spinlock::init() {
     lock = 0;
 }
 
+/**
+ * @brief Acquire the spinlock (block until available)
+ * 
+ * This function uses the x86 XCHG (exchange) instruction, which is:
+ * - Atomic: The operation cannot be interrupted midway
+ * - Implicitly locked: Works correctly across multiple CPU cores
+ * - Sequentially consistent: No memory reordering issues
+ * 
+ * The algorithm:
+ * 1. Atomically exchange the lock variable with 1
+ * 2. If the old value was 0, we got the lock (return)
+ * 3. If the old value was 1, lock was already held (spin)
+ * 4. Use PAUSE instruction while spinning to improve performance
+ * 
+ * The PAUSE instruction:
+ * - Improves performance on hyperthreaded CPUs
+ * - Reduces power consumption during spin-wait
+ * - Avoids the memory-order mis-speculation penalty when the spin loop exits
+ */
 void Spinlock::acquire() {
     while (1) {
         // Try to acquire lock using atomic exchange
@@ -36,6 +58,15 @@ void Spinlock::acquire() {
     }
 }
 
+/**
+ * @brief Try to acquire the spinlock without blocking
+ * 
+ * Similar to acquire(), but returns immediately if lock is already held.
+ * Useful when you want to try acquiring a lock but have alternative work
+ * to do if it's not available.
+ * 
+ * @return true if lock was successfully acquired, false if already locked
+ */
 bool Spinlock::tryAcquire() {
     uint32_t old_value;
     __asm__ volatile(
@@ -48,6 +79,18 @@ bool Spinlock::tryAcquire() {
     return (old_value == 0);
 }
 
+/**
+ * @brief Release the spinlock
+ * 
+ * Simply sets the lock variable back to 0 (unlocked). The empty inline assembly
+ * with "memory" clobber acts as a compiler-only barrier (it emits no instruction),
+ * so the compiler emits all earlier stores before the store that releases the lock.
+ * 
+ * This prevents the compiler from reordering memory operations across the lock
+ * boundary, which would violate the mutual exclusion guarantee.
+ * 
+ * @note Must only be called by the CPU that currently holds the lock
+ */
 void Spinlock::release() {
     // Memory barrier to ensure all previous stores are visible
     __asm__ volatile("" ::: "memory");
diff --git a/kernel/src/timer.cpp b/kernel/src/timer.cpp
index 80d3596..5a332b4 100644
--- a/kernel/src/timer.cpp
+++ b/kernel/src/timer.cpp
@@ -1,8 +1,8 @@
-/*
- * MetalOS Kernel - Timer Support
+/**
+ * @file timer.cpp
+ * @brief Implementation of PIT (Programmable Interval Timer) manager
  * 
- * Simple PIT (Programmable Interval Timer) support
- * Used for scheduling and timing
+ * The PIT generates periodic timer interrupts for system timekeeping and delays.
  */
 
 #include "kernel/timer.h"
@@ -15,20 +15,64 @@
 // PIT constants
 #define PIT_BASE_FREQUENCY 1193182  // Hz
 
-// I/O port access functions
+/**
+ * @brief Write a byte to an I/O port
+ * 
+ * Uses the x86 OUT instruction to write a byte to a hardware I/O port.
+ * 
+ * @param port I/O port address
+ * @param value Byte value to write
+ */
 static inline void outb(uint16_t port, uint8_t value) {
     __asm__ volatile("outb %0, %1" : : "a"(value), "Nd"(port));
 }
 
+/**
+ * @brief Read a byte from an I/O port
+ * 
+ * Uses the x86 IN instruction to read a byte from a hardware I/O port.
+ * 
+ * @param port I/O port address
+ * @return Byte value read from port
+ */
 static inline uint8_t inb(uint16_t port) {
     uint8_t value;
     __asm__ volatile("inb %1, %0" : "=a"(value) : "Nd"(port));
     return value;
 }
 
-// Timer class implementation
+/* Timer class implementation */
+
+/**
+ * @brief Constructor - initializes tick counter to zero
+ */
 Timer::Timer() : ticks(0) {}
 
+/**
+ * @brief Initialize the PIT to generate interrupts at specified frequency
+ * 
+ * The PIT works by counting down from a divisor value at its base frequency
+ * of 1.193182 MHz. When the counter reaches zero, it generates an interrupt
+ * and reloads the divisor.
+ * 
+ * For example, to get 1000 Hz (1ms ticks):
+ * divisor = 1193182 / 1000 = 1193
+ * 
+ * The process:
+ * 1. Calculate divisor from desired frequency
+ * 2. Send command byte to configure channel 0 in square wave mode (mode 3)
+ * 3. Send low byte of divisor
+ * 4. Send high byte of divisor
+ * 5. Unmask IRQ0 in the PIC to enable timer interrupts
+ * 
+ * Command byte 0x36 means:
+ * - Channel 0
+ * - Access mode: lobyte/hibyte
+ * - Mode 3: Square wave generator (not mode 2, the rate generator)
+ * - Binary counter (not BCD)
+ * 
+ * @param frequency Desired interrupt frequency in Hz (e.g., 1000 for 1ms ticks)
+ */
 void Timer::init(uint32_t frequency) {
     // Calculate divisor
     uint32_t divisor = PIT_BASE_FREQUENCY / frequency;
@@ -53,6 +97,16 @@ uint64_t Timer::getTicks() const {
     return ticks;
 }
 
+/**
+ * @brief Wait for a specified number of timer ticks
+ * 
+ * Calculates target tick count and uses HLT instruction to wait efficiently.
+ * HLT puts the CPU in a low-power state until the next interrupt arrives.
+ * 
+ * @param waitTicks Number of ticks to wait
+ * @note Blocking function - CPU will be idle during wait
+ * @note At 1000 Hz, each tick is 1 millisecond
+ */
 void Timer::wait(uint32_t waitTicks) const {
     uint64_t target = ticks + waitTicks;
     while (ticks < target) {
@@ -60,6 +114,14 @@ void Timer::wait(uint32_t waitTicks) const {
     }
 }
 
+/**
+ * @brief Handle timer interrupt (increment tick counter)
+ * 
+ * This function is called from the IRQ0 interrupt handler every time
+ * the PIT generates an interrupt. It simply increments the tick counter.
+ * 
+ * @note Must be called from interrupt context only
+ */
 void Timer::handleInterrupt() {
     ticks++;
 }