diff --git a/docs/manual.md b/docs/manual.md index a7612c04d..4c35fe059 100644 --- a/docs/manual.md +++ b/docs/manual.md @@ -1817,6 +1817,7 @@ At build-time, the Microkit tool embeds the capDL specification that describe all kernel objects that needs to be created. Then for each kernel object, the spec describe what state they need to be in and what capabilities exist to that object (i.e. who has access to this kernel object). For example, the spec would specify the: + - starting Instruction Pointer (IP), Stack Pointer (SP) and IPC buffer pointer of a Thread Control Block (TCB), - page table structure and mapping attributes of an address space (VSpace), - interrupts (IRQ), @@ -1868,7 +1869,7 @@ In order to do this however, the Microkit tool needs to emulate how the seL4 ker to obtain the list of free untyped objects that the kernel would give to the initial task. While this is non-trivial to do, it comes with the useful property that if the tool -produces a valid image, there should be no errors upon initialising the system +produces a valid image, there should be no errors upon initialising the system. If there are any errors with configuring the system (e.g running out of memory), they will be caught at build-time. This can only reasonably be done due to the static-architecture of Microkit systems. 
diff --git a/loader/src/aarch64/mmu.c b/loader/src/aarch64/mmu.c index 8ef6427ce..39b4676c8 100644 --- a/loader/src/aarch64/mmu.c +++ b/loader/src/aarch64/mmu.c @@ -12,18 +12,13 @@ #include "../cutil.h" #include "../uart.h" -void el1_mmu_enable(void); -void el2_mmu_enable(void); +void el1_mmu_enable(uint64_t aarch64_pt_ttbr0_el1, uint64_t aarch64_pt_ttbr1_el1); +void el2_mmu_enable(uint64_t aarch64_pt_ttbr0_el2); -/* Paging structures for kernel mapping */ -uint64_t boot_lvl0_upper[1 << 9] ALIGN(1 << 12); -uint64_t boot_lvl1_upper[1 << 9] ALIGN(1 << 12); -uint64_t boot_lvl2_upper[1 << 9] ALIGN(1 << 12); - -/* Paging structures for identity mapping */ -uint64_t boot_lvl0_lower[1 << 9] ALIGN(1 << 12); -uint64_t boot_lvl1_lower[1 << 9] ALIGN(1 << 12); -uint64_t boot_lvl2_lower[1 << 9] ALIGN(1 << 12); +/* Pointers to the top-level paging structures */ +uint64_t aarch64_pt_ttbr0_el1; +uint64_t aarch64_pt_ttbr1_el1; +uint64_t aarch64_pt_ttbr0_el2; int arch_mmu_enable(int logical_cpu) { @@ -37,9 +32,9 @@ int arch_mmu_enable(int logical_cpu) LDR_PRINT("INFO", logical_cpu, "enabling MMU\n"); el = current_el(); if (el == EL1) { - el1_mmu_enable(); + el1_mmu_enable(aarch64_pt_ttbr0_el1, aarch64_pt_ttbr1_el1); } else if (el == EL2) { - el2_mmu_enable(); + el2_mmu_enable(aarch64_pt_ttbr0_el2); } else { LDR_PRINT("ERROR", logical_cpu, "unknown EL for MMU enable\n"); } diff --git a/loader/src/aarch64/util64.S b/loader/src/aarch64/util64.S index ccf1889b0..fd94ebe36 100644 --- a/loader/src/aarch64/util64.S +++ b/loader/src/aarch64/util64.S @@ -308,7 +308,6 @@ END_FUNC(el1_mmu_disable) BEGIN_FUNC(el2_mmu_disable) stp x29, x30, [sp, #-16]! - stp x27, x28, [sp, #-16]! mov x29, sp /* Disable caches */ @@ -323,14 +322,18 @@ BEGIN_FUNC(el2_mmu_disable) */ bl invalidate_icache - ldp x27, x28, [sp], #16 ldp x29, x30, [sp], #16 ret END_FUNC(el2_mmu_disable) +/* + * Enables the MMU for EL1. + * Takes two arguments: the physical addresses for TTBR0_EL1 (x0) and TTBR1_EL1 (x1). 
+ */ BEGIN_FUNC(el1_mmu_enable) stp x29, x30, [sp, #-16]! stp x27, x28, [sp, #-16]! + /* move caller-saved to callee-saved registers */ mov x29, sp mov x27, x0 mov x28, x1 @@ -358,10 +361,8 @@ BEGIN_FUNC(el1_mmu_enable) msr tcr_el1, x10 /* Setup page tables */ - adrp x8, boot_lvl0_lower - msr ttbr0_el1, x8 - adrp x8, boot_lvl0_upper - msr ttbr1_el1, x8 + msr ttbr0_el1, x27 /* argument 0 */ + msr ttbr1_el1, x28 /* argument 1 */ isb /* invalidate all TLB entries for EL1 */ @@ -374,12 +375,18 @@ BEGIN_FUNC(el1_mmu_enable) ldp x27, x28, [sp], #16 ldp x29, x30, [sp], #16 ret - END_FUNC(el1_mmu_enable) +/* + * Enables the MMU for EL2. + * Takes one argument, the physical address for TTBR0_EL2 (x0). + */ BEGIN_FUNC(el2_mmu_enable) stp x29, x30, [sp, #-16]! + stp x27, x28, [sp, #-16]! + /* move caller-saved to callee-saved registers */ mov x29, sp + mov x28, x0 /* Disable the MMU */ bl el2_mmu_disable @@ -403,8 +410,7 @@ BEGIN_FUNC(el2_mmu_enable) isb /* Setup page tables */ - adrp x8, boot_lvl0_lower - msr ttbr0_el2, x8 + msr ttbr0_el2, x28 /* argument 0 */ isb /* invalidate all TLB entries for EL2 */ @@ -423,9 +429,9 @@ BEGIN_FUNC(el2_mmu_enable) dsb ish isb + ldp x27, x28, [sp], #16 ldp x29, x30, [sp], #16 ret - END_FUNC(el2_mmu_enable) .extern arm_secondary_cpu_c_entry diff --git a/loader/src/riscv/mmu.c b/loader/src/riscv/mmu.c index 7751b25e9..b40350ca7 100644 --- a/loader/src/riscv/mmu.c +++ b/loader/src/riscv/mmu.c @@ -8,15 +8,9 @@ #include #include "../arch.h" -#include "../cutil.h" - -/* Paging structures for kernel mapping */ -uint64_t boot_lvl1_pt[1 << 9] ALIGN(1 << 12); -uint64_t boot_lvl2_pt[1 << 9] ALIGN(1 << 12); -uint64_t boot_lvl3_pt[1 << 9] ALIGN(1 << 12); -/* Paging structures for identity mapping */ -uint64_t boot_lvl2_pt_loader[1 << 9] ALIGN(1 << 12); +/* Pointers to the top-level paging structures */ +uintptr_t riscv64_boot_lvl1_pt; /* * This is the encoding for the MODE field of the satp register when @@ -36,7 +30,7 @@ int arch_mmu_enable(int 
logical_cpu) asm volatile( "csrw satp, %0\n" : - : "r"(VM_MODE | (uintptr_t)boot_lvl1_pt >> RISCV_PGSHIFT) + : "r"(VM_MODE | riscv64_boot_lvl1_pt >> RISCV_PGSHIFT) : ); asm volatile("fence.i" ::: "memory"); diff --git a/tool/microkit/src/loader.rs b/tool/microkit/src/loader.rs index 5e4ec89b9..2f83c69e3 100644 --- a/tool/microkit/src/loader.rs +++ b/tool/microkit/src/loader.rs @@ -4,21 +4,52 @@ // SPDX-License-Identifier: BSD-2-Clause // use crate::elf::{ElfFile, ElfSegmentData}; -use crate::sel4::{Arch, Config}; +use crate::sel4::{Arch, Config, PlatformConfigRegion}; use crate::uimage::uimage_serialise; -use crate::util::{mb, round_up, struct_to_bytes}; +use crate::util::{align_down, mb, round_up, struct_to_bytes}; +use std::cmp::min; use std::fs::File; use std::io::{BufWriter, Write}; +use std::mem; use std::ops::Range; use std::path::Path; macro_rules! grab_symbol { - ($elf: expr, $symbol_name: literal) => { + ($elf: expr, $symbol_name: expr) => { $elf.find_symbol($symbol_name) .expect(concat!("Could not find '", $symbol_name, "' symbol")) }; } +// XX: This could be generic over an arbitrary type T if we could specify that T implements from_le_bytes, +// but we can't. +fn read_symbol_maybe(elf: &ElfFile, symbol_name: &str) -> Option<u64> { + let (addr, size) = elf.find_symbol(symbol_name).ok()?; + + let symbol_bytes = elf.get_data(addr, size)?; + + assert!(mem::size_of::<u64>() == symbol_bytes.len()); + + Some(u64::from_le_bytes(symbol_bytes.try_into().ok()?)) +} + +macro_rules! 
write_symbol { + ($loader_image: expr, $image_vaddr: expr, $elf: expr, $symbol: literal, $symbol_var: expr) => { + let (addr, size) = grab_symbol!($elf, $symbol); + let addr = usize::try_from(addr).expect("addr fits in usize"); + let size = usize::try_from(size).expect("size fits in usize"); + let image_vaddr = usize::try_from($image_vaddr).expect("vaddr fits in usize"); + + assert!(addr >= image_vaddr); + assert!(size == ::std::mem::size_of_val(&$symbol_var)); + + let offset: usize = (addr - image_vaddr); + assert!(offset <= $loader_image.len()); + + $loader_image[offset..(offset + size)].copy_from_slice(&$symbol_var.to_le_bytes()); + }; +} + const PAGE_TABLE_SIZE: usize = 4096; pub mod aarch64 { @@ -32,6 +63,7 @@ pub mod aarch64 { pub const LVL0_BITS: u64 = 9; pub const LVL1_BITS: u64 = 9; pub const LVL2_BITS: u64 = 9; + pub const LVL3_BITS: u64 = 9; pub fn lvl0_index(addr: u64) -> usize { let idx = (addr >> (BLOCK_BITS_2MB + LVL2_BITS + LVL1_BITS)) & mask(LVL0_BITS); @@ -48,6 +80,11 @@ pub mod aarch64 { idx as usize } + pub fn lvl3_index(addr: u64) -> usize { + let idx = (addr >> PAGE_BITS_4KB) & mask(LVL3_BITS); + idx as usize + } + /// Stage 1 translation table page/block descriptors have bits[4:2] containing /// AttrIndex[2:0]. The AttrIndex values depends on our configuration of /// the `MAIR_EL1` or `MAIR_EL2` registers done in util64.S; @@ -105,6 +142,11 @@ pub mod aarch64 { /// > and the level 2 descriptor n is 21. 
pub const BLOCK_BITS_2MB: u64 = 21; + // TODO: + + pub const BLOCK_BITS_512GB: u64 = 39; + pub const PAGE_BITS_4KB: u64 = 12; + /// Per "Table D8-52 Stage 1 VMSAv8-64 Block and Page descriptor fields" and /// "Figure D8-14 VMSAv8-64 Block descriptor formats" of ARM DDI0487L.b; /// specifically subfigure "4KB, 16KB, and 64KB granules, 48-bit OA" @@ -117,7 +159,7 @@ pub mod aarch64 { let shareability = if attr_index == s1_mair_attr_index::MT_NORMAL { // Match what the seL4 kernel uses for its page tables, which // is especially necessary for SMP booting which relies on it - // for coherency. + // for coherency. See the comment in seL4 `release_secondary_cpus()`. shareability_attributes::INNER_SHAREABLE } else { // Per $R_{PYFVQ}$: @@ -250,6 +292,7 @@ pub mod aarch64 { } mod riscv64 { + pub(crate) const BLOCK_BITS_1GB: u64 = 30; pub(crate) const BLOCK_BITS_2MB: u64 = 21; pub(crate) const PAGE_BITS_4K: u64 = 12; @@ -289,18 +332,18 @@ mod riscv64 { /// Checks that each region in the given list does not overlap with any other region. 
/// Panics upon finding an overlapping region -fn check_non_overlapping(regions: &Vec<(u64, &[u8])>) { +fn check_non_overlapping(regions: &Vec<(u64, u64)>) { let mut checked: Vec<(u64, u64)> = Vec::new(); - for (base, data) in regions { - let end = base + data.len() as u64; + for &(base, size) in regions.iter() { + let end = base + size; // Check that this does not overlap with any checked regions - for (b, e) in &checked { - if !(end <= *b || *base >= *e) { + for &(b, e) in checked.iter() { + if !(end <= b || base >= e) { panic!("Overlapping regions: [{base:x}..{end:x}) overlaps [{b:x}..{e:x})"); } } - checked.push((*base, end)); + checked.push((base, end)); } } @@ -330,6 +373,7 @@ pub struct Loader<'a> { header: LoaderHeader64, region_metadata: Vec, regions: Vec<(u64, &'a [u8])>, + page_table_bytes: Vec, word_size: usize, elf_machine: u16, entry: u64, @@ -427,28 +471,15 @@ impl<'a> Loader<'a> { panic!("INTERNAL: could not determine kernel_first_paddr"); }; - let pagetable_vars = match config.arch { - Arch::Aarch64 => Loader::aarch64_setup_pagetables( - config, - &loader_elf, - kernel_first_vaddr, - kernel_first_paddr, - ), - Arch::Riscv64 => Loader::riscv64_setup_pagetables( - config, - &loader_elf, - kernel_first_vaddr, - kernel_first_paddr, - ), - Arch::X86_64 => unreachable!("x86_64 does not support creating a loader image"), - }; - let image_segment = loader_elf .segments - .into_iter() + .iter() .find(|segment| segment.loadable) .expect("Did not find loadable segment"); + + // Called "vaddr" but due to 1:1 mapping vaddr == paddr. let image_vaddr = image_segment.virt_addr; + // We have to clone here as the image executable is part of this function return object, // and the loader ELF is deserialised in this scope, so its lifetime will be shorter than // the return object. 
@@ -458,14 +489,6 @@ impl<'a> Loader<'a> { panic!("The loader entry point must be the first byte in the image"); } - for (var_addr, var_size, var_data) in pagetable_vars { - let offset = var_addr - image_vaddr; - assert!(var_size == var_data.len() as u64); - assert!(offset > 0); - assert!(offset <= loader_image.len() as u64); - loader_image[offset as usize..(offset + var_size) as usize].copy_from_slice(&var_data); - } - let kernel_entry = kernel_elf.entry; // initial task virt + pv_offset == initial task physical, so @@ -477,11 +500,6 @@ impl<'a> Loader<'a> { ui_p_reg_start + (initial_task_vaddr_range.end - initial_task_vaddr_range.start); assert!(ui_p_reg_end > ui_p_reg_start); - // This clone isn't too bad as it is just a Vec<(u64, &[u8])> - let mut all_regions_with_loader = regions.clone(); - all_regions_with_loader.push((image_vaddr, &loader_image)); - check_non_overlapping(&all_regions_with_loader); - let mut region_metadata = Vec::new(); let mut offset: u64 = 0; for (addr, data) in &regions { @@ -494,10 +512,77 @@ impl<'a> Loader<'a> { offset += data.len() as u64; } - let size = std::mem::size_of::() as u64 - + region_metadata.iter().fold(0_u64, |acc, x| { - acc + x.size + std::mem::size_of::() as u64 - }); + let partial_size = loader_image.len() as u64 + + mem::size_of::() as u64 + + (region_metadata.len() * mem::size_of::()) as u64 + + offset; + + let page_tables_paddr_start = image_vaddr + partial_size; + + let mut page_table_bytes = Vec::<u8>::new(); + match config.arch { + Arch::Aarch64 => { + let (ttbr0_el2, ttbr0_el1, ttbr1_el1) = Loader::aarch64_setup_pagetables( + config, + &loader_elf, + kernel_first_vaddr, + kernel_first_paddr, + page_tables_paddr_start, + &mut page_table_bytes, + ); + + write_symbol!( + loader_image, + image_vaddr, + loader_elf, + "aarch64_pt_ttbr0_el2", + ttbr0_el2 + ); + write_symbol!( + loader_image, + image_vaddr, + loader_elf, + "aarch64_pt_ttbr0_el1", + ttbr0_el1 + ); + write_symbol!( + loader_image, + image_vaddr, + loader_elf, + 
"aarch64_pt_ttbr1_el1", + ttbr1_el1 + ); + } + Arch::Riscv64 => { + let boot_lvl1_pt = Loader::riscv64_setup_pagetables( + config, + &loader_elf, + kernel_first_vaddr, + kernel_first_paddr, + page_tables_paddr_start, + &mut page_table_bytes, + ); + write_symbol!( + loader_image, + image_vaddr, + loader_elf, + "riscv64_boot_lvl1_pt", + boot_lvl1_pt + ); + } + Arch::X86_64 => unreachable!("x86_64 does not support creating a loader image"), + }; + + let size = partial_size + page_table_bytes.len() as u64; + + let mut all_regions_with_loader: Vec<_> = regions + .iter() + .map(|&(base, data)| (base, data.len() as u64)) + .collect(); + all_regions_with_loader.push((image_vaddr, size)); + check_non_overlapping(&all_regions_with_loader); + + // TODO: Check contained within real RAM. let header = LoaderHeader64 { magic, @@ -516,6 +601,7 @@ impl<'a> Loader<'a> { header, region_metadata, regions, + page_table_bytes, word_size: kernel_elf.word_size, elf_machine: kernel_elf.machine, entry: loader_elf.entry, @@ -539,6 +625,10 @@ impl<'a> Loader<'a> { bytes.extend_from_slice(data); } + bytes.extend_from_slice(&self.page_table_bytes); + + assert!(bytes.len() as u64 == self.header.size); + bytes } @@ -603,95 +693,194 @@ impl<'a> Loader<'a> { } } + /// RISC-V 64 page tables for our purposes uses the Sv39 translation scheme + /// (3-level page tables). + /// + /// It is split into two halves: the Upper/Kernel part of the page tables, + /// which matches the format seL4 expects. The lower half contains an + /// identity mapped region for the loader. 
+ /// + /// ```txt + /// (512 GiB) + /// 512 +---- Level 1 ---+ 2^39 + /// | | + /// | (empty) | + /// | | + /// k+1 +----------------+ (1 GiB) + /// | Level 2 Kernel | ----------> +---- Level 2 ---+ +-------------+ + /// k +----------------+ | | ----------> | 2 MiB block | + /// | | 511 |----------------| +-------------+ + /// | | | | ----------> | 2 MiB block | + /// | | 510 |----------------| +-------------+ + /// | | | | ----------> | 2 MiB block | + /// | | |----------------| +------------- + /// | | (...) (...) (...) Kernel Regions + /// | | |----------------| +-------------+ + /// | | | | ----------> | 2 MiB block | + /// | | l+1 |----------------| +-------------+ + /// | | | Level 3 Kernel | ----+ + /// | | l |----------------| | + /// | | | | | (2 MiB) + /// | | | | +-----> +-- Level 3 --+ +------------+ + /// | | | | | | ----------> | 4 KiB page | + /// | | | | 511 |-------------| +------------+ + /// | | | (empty) | | | ----------> | 4 KiB page | + /// | (empty) | | | |-------------| +------------+ + /// | | | | | | ----------> | 4 KiB page | + /// | | | | m |-------------| +------------+ p + /// | | | | | (empty) | + /// | | | | +-------------+ + /// | | | | + /// | | 0 +----------------+ + /// | | + /// | | + /// | | + /// | | + /// | | + /// s+1 +----------------+ (1 GiB) + /// | Level 2 Loader | ----------> +-- Level 2 --+ +-------------+ + /// s +----------------+ | | ----------> | 2 MiB block | + /// | | 511 +-------------+ +-------------+ + /// | | | | ----------> | 2 MiB block | + /// | (empty) | 510 +-------------+ +-------------+ + /// | | | | ----------> | 2 MiB block | + /// | | |-------------| +-------------+ + /// 0 +----------------+ | | ----------> | 2 MiB block | + /// |-------------| +-------------+ + /// (...) (...) (...) 
Loader Regions + /// |-------------| +-------------+ + /// | | ----------> | 2 MiB block | + /// |-------------| +-------------+ + /// | | ----------> | 2 MiB block | + /// t +-------------+ +-------------+ + /// | | + /// | (empty) | + /// | | + /// +-------------+ + /// + /// + /// Where: + /// k = align_down(kernel_first_vaddr, 1GiB), + /// l = align_down(kernel_first_vaddr, 2MiB), + /// m = align_down(kernel_first_vaddr, 4KiB), + /// p = align_down(kernel_first_paddr, 4KiB), + /// + /// s = align_down(text_addr, 1GiB), + /// t = align_down(text_addr, 2MiB), + /// ``` + /// fn riscv64_setup_pagetables( config: &Config, elf: &ElfFile, - first_vaddr: u64, - first_paddr: u64, - ) -> Vec<(u64, u64, [u8; PAGE_TABLE_SIZE])> { + kernel_first_vaddr: u64, + kernel_first_paddr: u64, + page_tables_paddr_start: u64, + page_table_bytes: &mut Vec, + ) -> u64 { + use riscv64::{pt_index, pte_leaf, pte_next, BLOCK_BITS_1GB, BLOCK_BITS_2MB, PAGE_BITS_4K}; + let (text_addr, _) = grab_symbol!(elf, "_text"); - let (boot_lvl1_pt_addr, boot_lvl1_pt_size) = grab_symbol!(elf, "boot_lvl1_pt"); - let (boot_lvl2_pt_addr, boot_lvl2_pt_size) = grab_symbol!(elf, "boot_lvl2_pt"); - let (boot_lvl3_pt_addr, boot_lvl3_pt_size) = grab_symbol!(elf, "boot_lvl3_pt"); - let (boot_lvl2_pt_loader_addr, boot_lvl2_pt_loader_size) = - grab_symbol!(elf, "boot_lvl2_pt_loader"); // We map the loader using 2MB pages, so make sure the base is actually aligned. 
- assert!(text_addr.is_multiple_of(1 << riscv64::BLOCK_BITS_2MB)); + assert!(text_addr.is_multiple_of(1 << BLOCK_BITS_2MB)); + + const PAGE_TABLE_ENTRIES: usize = PAGE_TABLE_SIZE / mem::size_of::<u64>(); + + let mut serialise_page_table_to_paddr = { + let page_tables_paddr_start = { + let aligned_pt_paddr_start = + page_tables_paddr_start.next_multiple_of(PAGE_TABLE_SIZE as u64); + if aligned_pt_paddr_start != page_tables_paddr_start { + let alignment_diff = + (aligned_pt_paddr_start - page_tables_paddr_start) as usize; + page_table_bytes.resize(alignment_diff, 0); + } - let num_pt_levels = config.riscv_pt_levels.unwrap().levels(); + aligned_pt_paddr_start + }; - let mut boot_lvl1_pt: [u8; PAGE_TABLE_SIZE] = [0; PAGE_TABLE_SIZE]; - { - let text_index_lvl1 = riscv64::pt_index(num_pt_levels, text_addr, 1); - let pt_entry = riscv64::pte_next(boot_lvl2_pt_loader_addr); - let start = 8 * text_index_lvl1; - let end = start + 8; - boot_lvl1_pt[start..end].copy_from_slice(&pt_entry.to_le_bytes()); - } + // This maintains the current end of the PT array. 
+ let mut next_pt_paddr = page_tables_paddr_start; - let mut boot_lvl2_pt_loader: [u8; PAGE_TABLE_SIZE] = [0; PAGE_TABLE_SIZE]; - { - let text_index_lvl2 = riscv64::pt_index(num_pt_levels, text_addr, 2); - for (page, i) in (text_index_lvl2..512).enumerate() { - let start = 8 * i; - let end = start + 8; - let addr = text_addr + ((page as u64) << riscv64::BLOCK_BITS_2MB); - let pt_entry = riscv64::pte_leaf(addr); - boot_lvl2_pt_loader[start..end].copy_from_slice(&pt_entry.to_le_bytes()); + move |page_table: &mut [u64; PAGE_TABLE_ENTRIES]| -> u64 { + let pt_paddr = next_pt_paddr; + page_table_bytes.extend(page_table.iter().flat_map(|pte| pte.to_le_bytes())); + next_pt_paddr += PAGE_TABLE_SIZE as u64; + page_table.fill(0); + pt_paddr } - } + }; - { - let index = riscv64::pt_index(num_pt_levels, first_vaddr, 1); - let start = 8 * index; - let end = start + 8; - boot_lvl1_pt[start..end] - .copy_from_slice(&riscv64::pte_next(boot_lvl2_pt_addr).to_le_bytes()); - } + let num_pt_levels = config.riscv_pt_levels.unwrap().levels(); + assert!(num_pt_levels == 3); + + // Manufacture the constants as per the diagram. 
+ let k = align_down(kernel_first_vaddr, BLOCK_BITS_1GB); + let l = align_down(kernel_first_vaddr, BLOCK_BITS_2MB); + let m = align_down(kernel_first_vaddr, PAGE_BITS_4K); + let p = align_down(kernel_first_paddr, PAGE_BITS_4K); + + let s = align_down(text_addr, BLOCK_BITS_1GB); + let t = align_down(text_addr, BLOCK_BITS_2MB); + + // Manufacture the kernel page tables + let kernel_lvl2_pt_paddr = { + let mut lvl2_pt_kernel = [0u64; PAGE_TABLE_ENTRIES]; + + let mut paddr = p; + let index_l = pt_index(num_pt_levels, l, 2); - let mut boot_lvl3_pt: [u8; PAGE_TABLE_SIZE] = [0; PAGE_TABLE_SIZE]; - let mut boot_lvl2_pt: [u8; PAGE_TABLE_SIZE] = [0; PAGE_TABLE_SIZE]; - { - let mut index_lvl2 = riscv64::pt_index(num_pt_levels, first_vaddr, 2); - if !first_vaddr.is_multiple_of(1 << riscv64::BLOCK_BITS_2MB) { - let index_lvl3 = riscv64::pt_index(num_pt_levels, first_vaddr, 3); - for (page, i) in (index_lvl3..512).enumerate() { - let start = 8 * i; - let end = start + 8; - let addr = first_paddr + ((page as u64) << riscv64::PAGE_BITS_4K); - assert!(addr.is_multiple_of(1 << riscv64::PAGE_BITS_4K)); - let pt_entry = riscv64::pte_leaf(addr); - boot_lvl3_pt[start..end].copy_from_slice(&pt_entry.to_le_bytes()); + lvl2_pt_kernel[index_l] = if kernel_first_vaddr.is_multiple_of(1 << BLOCK_BITS_2MB) { + assert!(paddr.is_multiple_of(1 << BLOCK_BITS_2MB)); + let pte = pte_leaf(paddr); + paddr += 1 << BLOCK_BITS_2MB; + pte + } else { + let mut lvl3_pt_kernel = [0u64; PAGE_TABLE_ENTRIES]; + + let index_m = pt_index(num_pt_levels, m, 3); + + for index in index_m..512 { + lvl3_pt_kernel[index] = pte_leaf(paddr); + paddr += 1 << PAGE_BITS_4K; } - let start = 8 * index_lvl2; - let end = start + 8; - let lvl3_pt_entry = riscv64::pte_next(boot_lvl3_pt_addr); - assert!(boot_lvl3_pt_addr.is_multiple_of(1 << riscv64::PAGE_BITS_4K)); - boot_lvl2_pt[start..end].copy_from_slice(&lvl3_pt_entry.to_le_bytes()); - index_lvl2 += 1; + + let kernel_lvl3_pt_paddr = serialise_page_table_to_paddr(&mut 
lvl3_pt_kernel); + pte_next(kernel_lvl3_pt_paddr) + }; + + for index in (index_l + 1)..512 { + lvl2_pt_kernel[index] = pte_leaf(paddr); + paddr += 1 << BLOCK_BITS_2MB; } - let first_paddr_aligned = round_up(first_paddr, 1 << riscv64::BLOCK_BITS_2MB); - for (page, i) in (index_lvl2..512).enumerate() { - let start = 8 * i; - let end = start + 8; - let addr = first_paddr_aligned + ((page as u64) << riscv64::BLOCK_BITS_2MB); - assert!(addr.is_multiple_of(1 << riscv64::BLOCK_BITS_2MB)); - let pt_entry = riscv64::pte_leaf(addr); - boot_lvl2_pt[start..end].copy_from_slice(&pt_entry.to_le_bytes()); + + serialise_page_table_to_paddr(&mut lvl2_pt_kernel) + }; + + // Manufacture the loader page tables, which is relatively straightforward + let loader_lvl2_pt_paddr = { + let mut lvl2_pt_loader = [0u64; PAGE_TABLE_ENTRIES]; + + // Identity mapped, so vaddr == paddr. + let mut paddr = t; + + for index in pt_index(num_pt_levels, t, 2)..512 { + lvl2_pt_loader[index] = pte_leaf(paddr); + paddr += 1 << BLOCK_BITS_2MB; } - } - vec![ - (boot_lvl1_pt_addr, boot_lvl1_pt_size, boot_lvl1_pt), - (boot_lvl2_pt_addr, boot_lvl2_pt_size, boot_lvl2_pt), - (boot_lvl3_pt_addr, boot_lvl3_pt_size, boot_lvl3_pt), - ( - boot_lvl2_pt_loader_addr, - boot_lvl2_pt_loader_size, - boot_lvl2_pt_loader, - ), - ] + serialise_page_table_to_paddr(&mut lvl2_pt_loader) + }; + + // Manufacture the Level 1 table + let mut boot_lvl1_pt = [0u64; PAGE_TABLE_ENTRIES]; + + let index_s = pt_index(num_pt_levels, s, 1); + let index_k = pt_index(num_pt_levels, k, 1); + boot_lvl1_pt[index_k] = pte_next(kernel_lvl2_pt_paddr); + boot_lvl1_pt[index_s] = pte_next(loader_lvl2_pt_paddr); + + serialise_page_table_to_paddr(&mut boot_lvl1_pt) } /// AArch64 loader page tables have two variations: @@ -744,35 +933,8 @@ impl<'a> Loader<'a> { /// | | /// 1 +-------------+ (512 GiB) /// | Level 1 Lwr | ----------> +-- Level 1 --+ - /// 0 +-------------+ | | - /// | (empty) | - /// | | - /// u+1 +-------------+ +-------------+ - /// | 
uart_base | ----------> | 1 GiB block | - /// u +-------------+ +-------------+ - /// | | - /// | (empty) | - /// | | - /// i+1 +-------------+ (1 GiB) - /// | Level 2 Lwr | ----------> +-- Level 2 --+ - /// i +-------------+ | | - /// | | | (empty) | - /// | (empty) | | | - /// | | t +-------------+ +-------------+ - /// +-------------+ | | ----------> | 2 MiB block | - /// |-------------| +-------------+ - /// | | ----------> | 2 MiB block | - /// |-------------| +-------------+ - /// Loader Regions (...) (...) (...) - /// |-------------| +-------------+ - /// | | ----------> | 2 MiB block | - /// |-------------| +-------------+ - /// | | ----------> | 2 MiB block | - /// s +-------------+ +-------------+ - /// | | - /// | (empty) | - /// | | - /// +-------------+ + /// 0 +-------------+ TODO: RAM. + /// /// /// Where: /// k = align_down(kernel_first_vaddr, 512GiB), @@ -780,134 +942,442 @@ impl<'a> Loader<'a> { /// m = align_down(kernel_first_vaddr, 2MiB), /// p = align_down(kernel_first_paddr, 2MiB), /// u = align_down(uart_base, 1GiB), - /// i = align_down(loader_start_addr, 1GiB), - /// s = align_down(loader_start_addr, 2MiB), - /// t = align_up(loader_end_addr, 2MiB), /// ``` /// fn aarch64_setup_pagetables( - _config: &Config, + config: &Config, elf: &ElfFile, - first_vaddr: u64, - first_paddr: u64, - ) -> Vec<(u64, u64, [u8; PAGE_TABLE_SIZE])> { - use aarch64::s1_mair_attr_index::{MT_DEVICE_nGnRnE, MT_NORMAL}; - - let (boot_lvl1_lower_addr, boot_lvl1_lower_size) = grab_symbol!(elf, "boot_lvl1_lower"); - let (boot_lvl1_upper_addr, boot_lvl1_upper_size) = grab_symbol!(elf, "boot_lvl1_upper"); - let (boot_lvl2_upper_addr, boot_lvl2_upper_size) = grab_symbol!(elf, "boot_lvl2_upper"); - let (boot_lvl0_lower_addr, boot_lvl0_lower_size) = grab_symbol!(elf, "boot_lvl0_lower"); - let (boot_lvl0_upper_addr, boot_lvl0_upper_size) = grab_symbol!(elf, "boot_lvl0_upper"); - let (boot_lvl2_lower_addr, boot_lvl2_lower_size) = grab_symbol!(elf, "boot_lvl2_lower"); - - let 
(loader_start_addr, _) = grab_symbol!(elf, "_loader_start"); - let (loader_end_addr, _) = grab_symbol!(elf, "_loader_end"); - - if aarch64::lvl1_index(loader_start_addr) != aarch64::lvl1_index(loader_end_addr) { - panic!("We only map 1GiB, but loader paddr range covers multiple GiB"); - } + kernel_first_vaddr: u64, + kernel_first_paddr: u64, + page_tables_paddr_start: u64, + page_table_bytes: &mut Vec, + ) -> (u64, u64, u64) { + use aarch64::{ + block_descriptor, lvl0_index, lvl1_index, lvl2_index, lvl3_index, page_descriptor, + s1_mair_attr_index::{MT_DEVICE_nGnRnE, MT_NORMAL}, + table_descriptor, BLOCK_BITS_1GB, BLOCK_BITS_2MB, BLOCK_BITS_512GB, PAGE_BITS_4KB, + }; - let mut boot_lvl0_lower: [u8; PAGE_TABLE_SIZE] = [0; PAGE_TABLE_SIZE]; - { - let pt_entry = aarch64::table_descriptor(boot_lvl1_lower_addr); - boot_lvl0_lower[..8].copy_from_slice(&pt_entry.to_le_bytes()); - } + const PAGE_TABLE_ENTRIES: usize = PAGE_TABLE_SIZE / mem::size_of::(); - let mut boot_lvl1_lower: [u8; PAGE_TABLE_SIZE] = [0; PAGE_TABLE_SIZE]; + let mut serialise_page_table_to_paddr = { + let page_tables_paddr_start = { + let aligned_pt_paddr_start = + page_tables_paddr_start.next_multiple_of(PAGE_TABLE_SIZE as u64); + if aligned_pt_paddr_start != page_tables_paddr_start { + let alignment_diff = + (aligned_pt_paddr_start - page_tables_paddr_start) as usize; + page_table_bytes.resize(alignment_diff, 0); + } - // map optional UART MMIO in l1 1GB page, only available if CONFIG_PRINTING - if let Ok((uart_addr, uart_addr_size)) = elf.find_symbol("uart_addr") { - let data = elf - .get_data(uart_addr, uart_addr_size) - .expect("uart_addr not initialized"); + aligned_pt_paddr_start + }; - let uart_base = u64::from_le_bytes(data[0..8].try_into().unwrap()); + // This maintains the current end of the PT array. 
+ let mut next_pt_paddr = page_tables_paddr_start; - let lvl1_idx = aarch64::lvl1_index(uart_base); + move |page_table: &mut [u64; PAGE_TABLE_ENTRIES]| -> u64 { + let pt_paddr = next_pt_paddr; + page_table_bytes.extend(page_table.iter().flat_map(|pte| pte.to_le_bytes())); + next_pt_paddr += PAGE_TABLE_SIZE as u64; + page_table.fill(0); + pt_paddr + } + }; - let pt_entry = aarch64::block_descriptor(1, uart_base, MT_DEVICE_nGnRnE); + let identity_mapped_regions = { + let ram_regions = config + .normal_regions + .as_ref() + .expect("AArch64 should have normal_regions"); + + // println!("{:#x?}", ram_regions); + + let mut regions: Vec<_> = ram_regions + .iter() + .cloned() + .map(|region| (region, MT_DEVICE_nGnRnE)) + .collect(); + + // FIXME: Derive from the kernel build system. + if let Some(uart_base) = read_symbol_maybe(elf, "uart_addr") { + let uart_base = align_down(uart_base, PAGE_BITS_4KB); + regions.push(( + PlatformConfigRegion { + start: uart_base, + end: uart_base + (1 << PAGE_BITS_4KB), + }, + MT_DEVICE_nGnRnE, + )); + } - let start = 8 * lvl1_idx; - let end = 8 * (lvl1_idx + 1); - boot_lvl1_lower[start..end].copy_from_slice(&pt_entry.to_le_bytes()); - } + // FIXME: This is currently assuming implementation details of the BCM2711/ + // Raspberry Pi 4B spin table implementation, as it is the only + // platform we have that uses spin tables. Specifically, that + // it is always located at the 0 page. 
+ if elf.find_symbol("cpus_release_addr").is_ok() { + regions.push(( + PlatformConfigRegion { + start: 0x0, + end: 1 << PAGE_BITS_4KB, + }, + MT_DEVICE_nGnRnE, + )); + } - let mut boot_lvl2_lower: [u8; PAGE_TABLE_SIZE] = [0; PAGE_TABLE_SIZE]; + regions.sort_by_key(|(region, _)| region.start); - // 1GB lvl1 Table entry - let pt_entry = aarch64::table_descriptor(boot_lvl2_lower_addr); - let lvl1_idx = aarch64::lvl1_index(loader_start_addr); - let start = 8 * lvl1_idx; - let end = 8 * (lvl1_idx + 1); - boot_lvl1_lower[start..end].copy_from_slice(&pt_entry.to_le_bytes()); + regions + }; - // map the loader 1:1 access into 2MB lvl2 Block entries for a 4KB granule - let lvl2_idx = aarch64::lvl2_index(loader_start_addr); - for i in lvl2_idx..=aarch64::lvl2_index(loader_end_addr) { - let entry_idx: u64 = - ((i - aarch64::lvl2_index(loader_start_addr)) << aarch64::BLOCK_BITS_2MB) as u64; + // Manufacture the constants as per the diagram. + let k = align_down(kernel_first_vaddr, BLOCK_BITS_512GB); + let l = align_down(kernel_first_vaddr, BLOCK_BITS_1GB); + let m = align_down(kernel_first_vaddr, BLOCK_BITS_2MB); + let p = align_down(kernel_first_paddr, BLOCK_BITS_2MB); + + // Manufacture the kernel page tables, which is relatively straightforward. + let kernel_lvl1_pt_paddr = { + // First, the Level 2 Upr table. + let lvl2_pt_paddr = { + let mut lvl2_pt_kernel = [0u64; PAGE_TABLE_ENTRIES]; + + let mut vaddr = m; + let mut paddr = p; + while lvl1_index(m) == lvl1_index(vaddr) { + lvl2_pt_kernel[lvl2_index(vaddr)] = block_descriptor(2, paddr, MT_NORMAL); + + vaddr += 1 << BLOCK_BITS_2MB; + paddr += 1 << BLOCK_BITS_2MB; + } - let pt_entry = - aarch64::block_descriptor(2, loader_start_addr + entry_idx, MT_DEVICE_nGnRnE); + serialise_page_table_to_paddr(&mut lvl2_pt_kernel) + }; - let start = 8 * i; - let end = 8 * (i + 1); - boot_lvl2_lower[start..end].copy_from_slice(&pt_entry.to_le_bytes()); - } + // Then, the Level 1 Upr table. 
+ let mut lvl1_pt_kernel = [0u64; PAGE_TABLE_ENTRIES]; + lvl1_pt_kernel[lvl1_index(l)] = table_descriptor(lvl2_pt_paddr); - // TODO: this is a complete hack specific to BCM2711/Raspberry Pi 4B and - // will be removed with patches that re-do this loader mapping code. - if elf.find_symbol("cpus_release_addr").is_ok() { - let lvl2_idx = aarch64::lvl2_index(0); - // Make sure we don't override the loader mappings done above. - assert!(aarch64::lvl2_index(loader_start_addr) != lvl2_idx); - assert!(aarch64::lvl1_index(loader_start_addr) == aarch64::lvl1_index(0)); + serialise_page_table_to_paddr(&mut lvl1_pt_kernel) + }; - let pt_entry = aarch64::block_descriptor(2, lvl2_idx as u64, MT_DEVICE_nGnRnE); + // Manufacture the RAM page tables, which is a little bit more complicated. + // We assume that normal RAM lies between 0 <= paddr < 512GiB, i.e. + // that lvl0_index(any ram region addr) = 0. + let ram_lvl1_pt_paddr = { + // Validation of assumptions about the identity mapped regions. + let mut previous_end = None; + for (region, _) in identity_mapped_regions.iter() { + assert!(lvl0_index(region.start) == 0); + assert!(lvl0_index(region.end - 1) == 0); + // This is probably an unnecessary assumption. + assert!(region.start.is_multiple_of(4096)); + assert!(region.end.is_multiple_of(4096)); + // This is definitely necessary. + assert!(region.start >= previous_end.unwrap_or(0)); + previous_end = Some(region.end); + } - let start = 8 * lvl2_idx; - let end = 8 * (lvl2_idx + 1); - boot_lvl2_lower[start..end].copy_from_slice(&pt_entry.to_le_bytes()); - } + // We maintain three active page tables, which contain our previous + // known page table data. As we process regions in ascending order, + // once we have exceeded the bounds of the current reservation we + // can simply push to the page_table_bytes storage and insert into + // the parent PT the descriptor. + // When the current vaddr (/paddr, as identity mapped) exceeds the + // top value we rotate to a new PT. 
+ + let mut lvl1_pt = [0u64; PAGE_TABLE_ENTRIES]; + let mut lvl2_pt = [0u64; PAGE_TABLE_ENTRIES]; + let mut lvl3_pt = [0u64; PAGE_TABLE_ENTRIES]; + // TODO: These should be defines. Note that the top is the size of 1 level of the next level up. + // TODO: LVL1_ENTRY_RANGE? idk + #[allow(unused_mut)] + let mut lvl1_vaddr_top = 1 << BLOCK_BITS_512GB; + let mut lvl2_vaddr_top = 1 << BLOCK_BITS_1GB; + let mut lvl3_vaddr_top = 1 << BLOCK_BITS_2MB; + + // TODO: Tests... + // This is similar to aligned_power_of_two_regions() for the kernel UT, + // but we restrict it such that the output always is either 1GB, 2MB, or 4KB + // pages. + + // Allowed externally for the final iteration + let mut base = 0u64; + for &(ref region, attr_index) in identity_mapped_regions.iter() { + // println!("RAM Region: {:#x}..{:#x}", base, region.end); + // println!( + // " - Current Lvl1: {:#x}..{:#x}, entries: {}", + // (lvl1_vaddr_top - (1 << BLOCK_BITS_512GB)), + // lvl1_vaddr_top, + // lvl1_pt.iter().filter(|&&v| v != 0).count() + // ); + // println!( + // " - Current Lvl2: {:#x}..{:#x}, entries: {}", + // (lvl2_vaddr_top - (1 << BLOCK_BITS_1GB)), + // lvl2_vaddr_top, + // lvl2_pt.iter().filter(|&&v| v != 0).count() + // ); + // println!( + // " - Current Lvl3: {:#x}..{:#x}, entries: {}", + // (lvl3_vaddr_top - (1 << BLOCK_BITS_2MB)), + // lvl3_vaddr_top, + // lvl3_pt.iter().filter(|&&v| v != 0).count() + // ); + + // Handle the fact that the regions are not contiguous and that + // we might need to skip PT. - let mut boot_lvl0_upper: [u8; PAGE_TABLE_SIZE] = [0; PAGE_TABLE_SIZE]; - { - let pt_entry = aarch64::table_descriptor(boot_lvl1_upper_addr); - let idx = aarch64::lvl0_index(first_vaddr); - // For EL2. - boot_lvl0_lower[8 * idx..8 * (idx + 1)].copy_from_slice(&pt_entry.to_le_bytes()); - // For EL1. 
- boot_lvl0_upper[8 * idx..8 * (idx + 1)].copy_from_slice(&pt_entry.to_le_bytes()); - } + { + if region.start >= lvl3_vaddr_top { + if lvl3_pt != [0; _] { + let lvl3_pt_paddr = serialise_page_table_to_paddr(&mut lvl3_pt); + // println!("[iter] Serialise lvl3 table: {lvl3_pt_paddr:#x} for to {:#x}..{lvl3_vaddr_top:#x}", (lvl3_vaddr_top - (1 << BLOCK_BITS_2MB))); + assert!(lvl2_pt[lvl2_index(base)] == 0); + lvl2_pt[lvl2_index(base)] = table_descriptor(lvl3_pt_paddr); + } + + // TODO: just compute it. + while region.start >= lvl3_vaddr_top { + lvl3_vaddr_top += 1 << BLOCK_BITS_2MB; + } + } - let mut boot_lvl1_upper: [u8; PAGE_TABLE_SIZE] = [0; PAGE_TABLE_SIZE]; - { - let pt_entry = aarch64::table_descriptor(boot_lvl2_upper_addr); - let idx = aarch64::lvl1_index(first_vaddr); - boot_lvl1_upper[8 * idx..8 * (idx + 1)].copy_from_slice(&pt_entry.to_le_bytes()); - } + if region.start >= lvl2_vaddr_top { + if lvl2_pt != [0; _] { + let lvl2_pt_paddr = serialise_page_table_to_paddr(&mut lvl2_pt); + // println!("[iter] Serialise lvl2 table: {lvl2_pt_paddr:#x} for to {:#x}..{lvl2_vaddr_top:#x}, base: {:#x} lvl1_index(base): {:#x}", (lvl2_vaddr_top - (1 << BLOCK_BITS_1GB)), base, lvl1_index(base)); + assert!(lvl1_pt[lvl1_index(base)] == 0); + lvl1_pt[lvl1_index(base)] = table_descriptor(lvl2_pt_paddr); + } + + // TODO: just compute it. + while region.start >= lvl2_vaddr_top { + lvl2_vaddr_top += 1 << BLOCK_BITS_1GB; + } + } - let mut boot_lvl2_upper: [u8; PAGE_TABLE_SIZE] = [0; PAGE_TABLE_SIZE]; + if region.start >= lvl1_vaddr_top { + unreachable!( + "impossible as everything should fit here: {lvl1_vaddr_top:#x}" + ); + } + } - let lvl2_idx = aarch64::lvl2_index(first_vaddr); - for i in lvl2_idx..512 { - let entry_idx: u64 = - ((i - aarch64::lvl2_index(first_vaddr)) << aarch64::BLOCK_BITS_2MB) as u64; + // After serialising the old base, update the new one. 
+ base = region.start; + + // Inner Loop: + // Invariant: the page tables in lvl1_pt, lvl2_pt, lvl3_pt + // are either (1) for the current address range, + // or (2) are empty and for a lower level than the current level. + // Also, the values in lvlXXX_vaddr_top are always correct (even if empty) + // Also contiguous within the loop. + // Loop entry: (1) holds by work at the start of each region + while base != region.end { + // Condition is !=, but assert that we never skip it. + assert!(base < region.end); + + let size_bits = region.end.wrapping_sub(base).ilog2(); + let align_bits = min( + size_bits, + // FIXME: Once MSRV is > 1.97, use .lowest_one() method. + if base == 0 { + size_bits + } else { + base.trailing_zeros() + }, + ); + + // Match the size and alignment of the current region to + // the valid PT region sizes. + let (level, bits) = match u64::from(align_bits) { + BLOCK_BITS_1GB.. => (1, BLOCK_BITS_1GB), + BLOCK_BITS_2MB.. => (2, BLOCK_BITS_2MB), + PAGE_BITS_4KB.. => (3, PAGE_BITS_4KB), + 0.. => panic!("impossible; regions should be aligned to 4K at least"), + }; + + let pt_region_size = 1u64 << bits; + let top = base + pt_region_size; + + // println!("- Aligned PT region: {:#x}..{:#x} (size_bits: {}, align_bits: {}, bits: {})", base, top, size_bits, align_bits, bits); + // println!( + // " - Current Lvl1: {:#x}..{:#x}, entries: {}", + // (lvl1_vaddr_top - (1 << BLOCK_BITS_512GB)), + // lvl1_vaddr_top, + // lvl1_pt.iter().filter(|&&v| v != 0).count() + // ); + // println!( + // " - Current Lvl2: {:#x}..{:#x}, entries: {}", + // (lvl2_vaddr_top - (1 << BLOCK_BITS_1GB)), + // lvl2_vaddr_top, + // lvl2_pt.iter().filter(|&&v| v != 0).count() + // ); + // println!( + // " - Current Lvl3: {:#x}..{:#x}, entries: {}", + // (lvl3_vaddr_top - (1 << BLOCK_BITS_2MB)), + // lvl3_vaddr_top, + // lvl3_pt.iter().filter(|&&v| v != 0).count() + // ); + + match level { + 1 => { + // If it belongs in Level 1 PT, then it must go in + // lvl1 pt. 
By the invariant, base < lvl1_vaddr_top. + assert!(base < lvl1_vaddr_top); + // top is <= lvl1_vaddr_top (the case where it is the topmost entry) + assert!(top <= lvl1_vaddr_top); + + assert!(lvl1_pt[lvl1_index(base)] == 0); + lvl1_pt[lvl1_index(base)] = block_descriptor(1, base, attr_index); + + if top == lvl1_vaddr_top { + // Invariant maintenance: if the new top would be now equal + // the end of the page table's region top, we need a new + // page table object and add it to the list. + + // This should be possible to handle - we just need to break out of this loop + todo!("handle the case where top of lvl1 is occupied - this would be near the top of 512GiB"); + } + + // Invariant: Lower levels are empty. + assert!(lvl2_pt == [0; _]); + assert!(lvl3_pt == [0; _]); + // Invariant maintenance: vaddr_top is right range for current PT. + // it's empty so we need to increment the top to be current top (1G aligned) + 2MIB (512 lvl3 entries) + lvl3_vaddr_top = top + (1 << BLOCK_BITS_2MB); + // it's empty so we need to increment the top to be current top (1G aligned) + 1G (512 lvl2 entries) + lvl2_vaddr_top = top + (1 << BLOCK_BITS_1GB); + } + 2 => { + // If it is a 2MiB block, it must go in the Level 2 PT; + // by our invariants: base < lvl2_vaddr_top and top <= lvl2_vaddr_top + assert!(base < lvl2_vaddr_top); + assert!(top <= lvl2_vaddr_top); + + assert!(lvl2_pt[lvl2_index(base)] == 0); + lvl2_pt[lvl2_index(base)] = block_descriptor(2, base, attr_index); + + if top == lvl2_vaddr_top { + // Invariant maintenance: keep for current address range. + // As we're the top of the range, we can serialise the table.
+ + let lvl2_pt_paddr = serialise_page_table_to_paddr(&mut lvl2_pt); + // println!("Serialise lvl2 table: {lvl2_pt_paddr:#x} up to {lvl2_vaddr_top:#x}"); + lvl2_vaddr_top += 1 << BLOCK_BITS_1GB; + + lvl1_pt[lvl1_index(base)] = table_descriptor(lvl2_pt_paddr); + + if top == lvl1_vaddr_top { + todo!("handle the case where top of lvl1 is occupied - this would be near the top of 512GiB"); + } + } + + // Invariant: Lower levels are empty. + assert!(lvl3_pt == [0; _]); + // Invariant maintenance: vaddr_top is right range for current PT. + // it's empty so we need to increment the top to be current top (2MIB aligned) + 2MIB (512 lvl3 entries) + lvl3_vaddr_top = top + (1 << BLOCK_BITS_2MB); + } + 3 => { + // If it is a 4K page, it must go in the Level 3 PT; + // by our invariants: base < lvl3_vaddr_top and top <= lvl3_vaddr_top + assert!(base < lvl3_vaddr_top); + assert!(top <= lvl3_vaddr_top); + + assert!(lvl3_pt[lvl3_index(base)] == 0); + lvl3_pt[lvl3_index(base)] = page_descriptor(base, attr_index); + + if top == lvl3_vaddr_top { + // Invariant maintenance: keep for current address range. + // As we're the top of the range, we can serialise the table. 
+ + let lvl3_pt_paddr = serialise_page_table_to_paddr(&mut lvl3_pt); + // println!("Serialise lvl3 table: {lvl3_pt_paddr:#x} for to {:#x}..{lvl3_vaddr_top:#x}", (lvl3_vaddr_top - (1 << BLOCK_BITS_2MB))); + lvl3_vaddr_top += 1 << BLOCK_BITS_2MB; + + assert!(lvl2_pt[lvl2_index(base)] == 0); + lvl2_pt[lvl2_index(base)] = table_descriptor(lvl3_pt_paddr); + + if top == lvl2_vaddr_top { + let lvl2_pt_paddr = serialise_page_table_to_paddr(&mut lvl2_pt); + // println!("Serialise lvl2 table: {lvl2_pt_paddr:#x} for to {:#x}..{lvl2_vaddr_top:#x}", (lvl2_vaddr_top - (1 << BLOCK_BITS_1GB))); + lvl2_vaddr_top += 1 << BLOCK_BITS_1GB; + + assert!(lvl1_pt[lvl1_index(base)] == 0); + lvl1_pt[lvl1_index(base)] = table_descriptor(lvl2_pt_paddr); + + if top == lvl1_vaddr_top { + todo!("handle the case where top of lvl1 is occupied - this would be near the top of 512GiB"); + } + } + } + + // Invariant: lower levels empty is vacuously true + } + _ => unreachable!("level is 1..=3"), + } - let pt_entry = aarch64::block_descriptor(2, first_paddr + entry_idx, MT_NORMAL); + base = base + pt_region_size; + } + } - let start = 8 * i; - let end = 8 * (i + 1); - boot_lvl2_upper[start..end].copy_from_slice(&pt_entry.to_le_bytes()); - } + // By the loop invariant, we know that anything before has been serialised. + // However, as we are at the end of the loop now, we might have + // page tables that have been partially filled out, and we need to + // serialise these.
+ + if lvl3_pt != [0; _] { + let lvl3_pt_paddr = serialise_page_table_to_paddr(&mut lvl3_pt); + // println!("[end] Serialise lvl3 table: {lvl3_pt_paddr:#x}"); + assert!(lvl2_pt[lvl2_index(base)] == 0); + lvl2_pt[lvl2_index(base)] = table_descriptor(lvl3_pt_paddr); + } - vec![ - (boot_lvl0_lower_addr, boot_lvl0_lower_size, boot_lvl0_lower), - (boot_lvl1_lower_addr, boot_lvl1_lower_size, boot_lvl1_lower), - (boot_lvl0_upper_addr, boot_lvl0_upper_size, boot_lvl0_upper), - (boot_lvl1_upper_addr, boot_lvl1_upper_size, boot_lvl1_upper), - (boot_lvl2_upper_addr, boot_lvl2_upper_size, boot_lvl2_upper), - (boot_lvl2_lower_addr, boot_lvl2_lower_size, boot_lvl2_lower), - ] + if lvl2_pt != [0; _] { + let lvl2_pt_paddr = serialise_page_table_to_paddr(&mut lvl2_pt); + // println!("[end] Serialise lvl2 table: {lvl2_pt_paddr:#x} for to {:#x}..{lvl2_vaddr_top:#x}, base: {:#x} lvl1_index(base): {:#x}", (lvl2_vaddr_top - (1 << BLOCK_BITS_1GB)), base, lvl1_index(base)); + assert!(lvl1_pt[lvl1_index(base)] == 0); + lvl1_pt[lvl1_index(base)] = table_descriptor(lvl2_pt_paddr); + } + + // the level1 pt should not be empty. lol. + assert!(lvl1_pt != [0; _]); + + // println!("New lvl1 table"); + serialise_page_table_to_paddr(&mut lvl1_pt) + }; + + // Depending on whether we are in hypervisor mode, we either need to + // return the TTBR0_EL2 or TTBR[0,1]_EL1 values. We return u64::MAX + // so as to return garbage - an unaligned address outside of physical + // memory. + if config.hypervisor { + // Manufacture the Level 0 table, containing the kernel table + // and the RAM tables. 
+ + let mut ttbr0_el2_pt = [0u64; PAGE_TABLE_ENTRIES]; + + assert!(lvl0_index(k) != lvl0_index(0)); + ttbr0_el2_pt[lvl0_index(k)] = table_descriptor(kernel_lvl1_pt_paddr); + ttbr0_el2_pt[lvl0_index(0)] = table_descriptor(ram_lvl1_pt_paddr); + + let ttbr0_el2 = serialise_page_table_to_paddr(&mut ttbr0_el2_pt); + + (ttbr0_el2, u64::MAX, u64::MAX) + } else { + let mut ttbr0_el1_pt = [0u64; PAGE_TABLE_ENTRIES]; + let mut ttbr1_el1_pt = [0u64; PAGE_TABLE_ENTRIES]; + + // Kernel in TTBR1 (Upper) + ttbr1_el1_pt[lvl0_index(k)] = table_descriptor(kernel_lvl1_pt_paddr); + // Loader in TTBR0 (Lower) + ttbr0_el1_pt[lvl0_index(0)] = table_descriptor(ram_lvl1_pt_paddr); + + let ttbr0_el1 = serialise_page_table_to_paddr(&mut ttbr0_el1_pt); + let ttbr1_el1 = serialise_page_table_to_paddr(&mut ttbr1_el1_pt); + + (u64::MAX, ttbr0_el1, ttbr1_el1) + } } } diff --git a/tool/microkit/src/sel4.rs b/tool/microkit/src/sel4.rs index 641fa63c6..e43363699 100644 --- a/tool/microkit/src/sel4.rs +++ b/tool/microkit/src/sel4.rs @@ -249,7 +249,7 @@ pub fn emulate_kernel_boot( } } -#[derive(Deserialize)] +#[derive(Deserialize, Debug, Clone)] pub struct PlatformConfigRegion { pub start: u64, pub end: u64, diff --git a/tool/microkit/src/util.rs b/tool/microkit/src/util.rs index 6f2c0e275..211c794d6 100644 --- a/tool/microkit/src/util.rs +++ b/tool/microkit/src/util.rs @@ -54,6 +54,14 @@ pub const fn round_down(n: u64, x: u64) -> u64 { } } +pub const fn align_up(n: u64, bits: u64) -> u64 { + round_up(n, 1 << bits) +} + +pub const fn align_down(n: u64, bits: u64) -> u64 { + round_down(n, 1 << bits) +} + pub fn is_power_of_two(n: u64) -> bool { assert!(n > 0); n & (n - 1) == 0