-#endif
-
-#define PFM_NONE 0x0 /* No page faults expected. Must be a kernel bug */
-#define PFM_KILL 0x1 /* On fault kill user process. */
-
-
-/*
- * Macros to build GDT entries in assembly.
- */
-#define SEG_NULL \
- .word 0, 0; \
- .byte 0, 0, 0, 0
-#define SEG(type,base,lim) \
- .word ((lim)&0xffff), ((base)&0xffff); \
- .byte (((base)>>16)&0xff), (0x90|(type)), \
- (0xc0|(((lim)>>16)&0xf)), (((base)>>24)&0xff)
-
-
-
-/* Page Table/Directory Entry flags
- * these are defined by the hardware
- */
-#define PG_P 0x1 /* Present */
-#define PG_W 0x2 /* Writeable */
-#define PG_U 0x4 /* User */
-#define PG_PWT 0x8 /* Write-Through */
-#define PG_PCD 0x10 /* Cache-Disable */
-#define PG_A 0x20 /* Accessed */
-#define PG_D 0x40 /* Dirty */
-#define PG_PS 0x80 /* Page Size */
-#define PG_MBZ 0x180 /* Bits must be zero */
-#define PG_USER 0xe00 /* Bits for user processes */
-/*
- * The PG_USER bits are not used by the kernel and they are
- * not interpreted by the hardware. The kernel allows
- * user processes to set them arbitrarily.
- */
-
-
-
-/* Control Register flags */
-#define CR0_PE 0x1 /* Protection Enable */
-#define CR0_MP 0x2 /* Monitor coProcessor */
-#define CR0_EM 0x4 /* Emulation */
-#define CR0_TS 0x8 /* Task Switched */
-#define CR0_ET 0x10 /* Extension Type */
-#define CR0_NE 0x20 /* Numeric Errror */
-#define CR0_WP 0x10000 /* Write Protect */
-#define CR0_AM 0x40000 /* Alignment Mask */
-#define CR0_NW 0x20000000 /* Not Writethrough */
-#define CR0_CD 0x40000000 /* Cache Disable */
-#define CR0_PG 0x80000000 /* Paging */
-
-#define CR4_PCE 0x100 /* Performance counter enable */
-#define CR4_MCE 0x40 /* Machine Check Enable */
-#define CR4_PSE 0x10 /* Page Size Extensions */
-#define CR4_DE 0x08 /* Debugging Extensions */
-#define CR4_TSD 0x04 /* Time Stamp Disable */
-#define CR4_PVI 0x02 /* Protected-Mode Virtual Interrupts */
-#define CR4_VME 0x01 /* V86 Mode Extensions */
-
-/* EFLAGS Register. */
-#define FLAG_CF 0x00000001 /* Carry Flag. */
-#define FLAG_PF 0x00000004 /* Parity Flag. */
-#define FLAG_AF 0x00000010 /* Auxiliary Carry Flag. */
-#define FLAG_ZF 0x00000040 /* Zero Flag. */
-#define FLAG_SF 0x00000080 /* Sign Flag. */
-#define FLAG_TF 0x00000100 /* Trap Flag. */
-#define FLAG_IF 0x00000200 /* Interrupt Flag. */
-#define FLAG_DF 0x00000400 /* Direction Flag. */
-#define FLAG_OF 0x00000800 /* Overflow Flag. */
-#define FLAG_IOPL 0x00003000 /* I/O Privilege Level (2 bits). */
-#define FLAG_IOPL_SHIFT 12
-#define FLAG_NT 0x00004000 /* Nested Task. */
-#define FLAG_RF 0x00010000 /* Resume Flag. */
-#define FLAG_VM 0x00020000 /* Virtual 8086 Mode. */
-#define FLAG_AC 0x00040000 /* Alignment Check. */
-#define FLAG_VIF 0x00080000 /* Virtual Interrupt Flag. */
-#define FLAG_VIP 0x00100000 /* Virtual Interrupt Pending. */
-#define FLAG_ID 0x00200000 /* ID Flag. */
-
-/* Page fault error codes */
-#define FEC_PR 0x1 /* Page fault caused by protection violation */
-#define FEC_WR 0x2 /* Page fault caused by a write */
-#define FEC_U 0x4 /* Page fault occured while in user mode */
-
-
-/* Application segment type bits */
-#define STA_X 0x8 /* Executable segment */
-#define STA_A 0x1 /* Accessed */
-
-#define STA_C 0x4 /* Conforming code segment (executable only) */
-#define STA_R 0x2 /* Readable (executable segments) */
-
-#define STA_E 0x4 /* Expand down (non-executable segments) */
-#define STA_W 0x2 /* Writeable (non-executable segments) */
-
-
-/* Segment selectors. */
-#define SEL_NULL 0x00 /* Null selector. */
-#define SEL_KCSEG 0x08 /* Kernel code selector. */
-#define SEL_KDSEG 0x10 /* Kernel data selector. */
-#define SEL_UCSEG 0x18 /* Kernel code selector. */
-#define SEL_UDSEG 0x20 /* Kernel data selector. */
-#define SEL_TSS 0x28 /* Task-state segment. */
-#define SEL_CNT 6 /* Number of segments. */
-
-#ifndef __ASSEMBLER__
-struct tss
- {
- uint16_t back_link, :16;
- uint32_t esp0;
- uint16_t ss0, :16;
- uint32_t esp1;
- uint16_t ss1, :16;
- uint32_t esp2;
- uint16_t ss2, :16;
- uint32_t cr3;
- uint32_t eip;
- uint32_t eflags;
- uint32_t eax, ecx, edx, ebx;
- uint32_t esp, ebp, esi, edi;
- uint16_t es, :16;
- uint16_t cs, :16;
- uint16_t ss, :16;
- uint16_t ds, :16;
- uint16_t fs, :16;
- uint16_t gs, :16;
- uint16_t ldt, :16;
- uint16_t trace, bitmap;
- };
-
-enum seg_system
- {
- SYS_SYSTEM = 0, /* System segment. */
- SYS_CODE_DATA = 1 /* Code or data segment. */
- };
-
-enum seg_granularity
- {
- GRAN_BYTE = 0, /* Limit has 1-byte granularity. */
- GRAN_PAGE = 1 /* Limit has 4 kB granularity. */
- };
-
-enum seg_type
- {
- /* System segment types. */
- TYPE_TSS_16_A = 1, /* 16-bit TSS (available). */
- TYPE_LDT = 2, /* LDT. */
- TYPE_TSS_16_B = 3, /* 16-bit TSS (busy). */
- TYPE_CALL_16 = 4, /* 16-bit call gate. */
- TYPE_TASK = 5, /* Task gate. */
- TYPE_INT_16 = 6, /* 16-bit interrupt gate. */
- TYPE_TRAP_16 = 7, /* 16-bit trap gate. */
- TYPE_TSS_32_A = 9, /* 32-bit TSS (available). */
- TYPE_TSS_32_B = 11, /* 32-bit TSS (busy). */
- TYPE_CALL_32 = 12, /* 32-bit call gate. */
- TYPE_INT_32 = 14, /* 32-bit interrupt gate. */
- TYPE_TRAP_32 = 15, /* 32-bit trap gate. */
-
- /* Code/data segment types. */
- TYPE_CODE = 8, /* 1=Code segment, 0=data segment. */
- TYPE_ACCESSED = 1, /* Set if accessed. */
-
- /* Data segment types. */
- TYPE_EXPAND_DOWN = 4, /* 1=Expands up, 0=expands down. */
- TYPE_WRITABLE = 2, /* 1=Read/write, 0=read-only. */
-
- /* Code segment types. */
- TYPE_CONFORMING = 4, /* 1=Conforming, 0=nonconforming. */
- TYPE_READABLE = 2 /* 1=Exec/read, 0=exec-only. */
- };
-
-static inline uint64_t
-make_dtr_operand (uint16_t limit, void *base)
-{
- return limit | ((uint64_t) (uint32_t) base << 16);
+\f
+/* Page directories and page tables.
+
+ For more information see [IA32-v3] pages 3-23 to 3-28.
+
+ PDEs and PTEs share a common format:
+
+ 32 12 0
+ +------------------------------------+------------------------+
+ | Physical Address | Flags |
+ +------------------------------------+------------------------+
+
+ In a PDE, the physical address points to a page table.
+ In a PTE, the physical address points to a data or code page.
+ The important flags are listed below.
+ When a PDE or PTE is not "present", the other flags are
+ ignored.
+ A PDE or PTE that is initialized to 0 will be interpreted as
+ "not present", which is just fine. */
+#define PG_P 0x1 /* 1=present, 0=not present. */
+#define PG_W 0x2 /* 1=read/write, 0=read-only. */
+#define PG_U 0x4 /* 1=user/kernel, 0=kernel only. */
+#define PG_A 0x20 /* 1=accessed, 0=not accessed. */
+#define PG_D 0x40 /* 1=dirty, 0=not dirty (PTEs only). */
+
+/* Obtains the page directory index from virtual address VA: VA >> PDSHIFT. */
+static inline uintptr_t pd_no (const void *va) {
+ return (uintptr_t) va >> PDSHIFT;
+}
+
+/* Returns a PDE that points to page table PT, with PG_U, PG_P and PG_W set. */
+static inline uint32_t pde_create (uint32_t *pt) {
+ ASSERT (pg_ofs (pt) == 0); /* PT must have a zero page offset. */
+ return vtop (pt) | PG_U | PG_P | PG_W;
+}
+
+/* Returns a pointer to the page table that page directory entry
+ PDE, which must be "present", points to. */
+static inline uint32_t *pde_get_pt (uint32_t pde) {
+ ASSERT (pde & PG_P);
+ return ptov (pde & ~PGMASK); /* Clear the PGMASK bits before calling ptov. */
+}
+
+/* Obtains the page table index from virtual address VA: (VA & PTMASK) >> PTSHIFT. */
+static inline unsigned pt_no (const void *va) {
+ return ((uintptr_t) va & PTMASK) >> PTSHIFT;
+}
+
+/* Returns a PTE that points to PAGE, whose page offset must be 0.
+ The PTE is marked present, so its page is readable.
+ If WRITABLE is true then it will be writable as well.
+ The page will be usable only by ring 0 code (the kernel). */
+static inline uint32_t pte_create_kernel (uint32_t *page, bool writable) {
+ ASSERT (pg_ofs (page) == 0);
+ return vtop (page) | PG_P | (writable ? PG_W : 0);
+}
+
+/* Returns a PTE that points to PAGE: the PTE that pte_create_kernel
+ builds for PAGE and WRITABLE, with PG_U set as well.
+ The page is readable, and also writable if WRITABLE is true.
+ The page will be usable by both user and kernel code. */
+static inline uint32_t pte_create_user (uint32_t *page, bool writable) {
+ return pte_create_kernel (page, writable) | PG_U;
+}
+
+/* Returns a pointer to the page that page table entry PTE, which
+ must be "present", points to. */
+static inline void *pte_get_page (uint32_t pte) {
+ ASSERT (pte & PG_P);
+ return ptov (pte & ~PGMASK);