author     Frans Kaashoek <[email protected]>   2018-09-23 08:24:42 -0400
committer  Frans Kaashoek <[email protected]>   2018-09-23 08:35:30 -0400
commit     ab0db651af6f1ffa8fe96909ce16ae314d65c3fb (patch)
tree       c429f8ee36fa7da1e25f564a160b031613ca05e9
parent     b818915f793cd20c5d1e24f668534a9d690f3cc8 (diff)
download   xv6-labs-ab0db651af6f1ffa8fe96909ce16ae314d65c3fb.tar.gz
           xv6-labs-ab0db651af6f1ffa8fe96909ce16ae314d65c3fb.tar.bz2
           xv6-labs-ab0db651af6f1ffa8fe96909ce16ae314d65c3fb.zip
Checkpoint port of xv6 to x86-64. Passed usertests on 2 processors a few times.
The x86-64 doesn't just add two levels to page tables to support 64-bit addresses; it is a different processor. For example, calling conventions, system calls, and segmentation are different from 32-bit x86. Segmentation is basically gone, but gs/fs in combination with MSRs can be used to hold a per-core pointer. In general, x86-64 is more straightforward than 32-bit x86. The port uses code from sv6 and the xv6 "rsc-amd64" branch.

A summary of the changes is as follows:
- Booting: switch to GRUB instead of xv6's boot loader (pass -kernel to qemu), because xv6's boot loader doesn't understand 64-bit ELF files. And, we don't care anymore about booting.
- Makefile: use -m64 instead of -m32 flag for gcc; delete boot loader, xv6.img, bochs, and memfs. For now don't use -O2, since usertests with -O2 is bigger than MAXFILE!
- Update gdb.tmpl to be for i386 or x86-64
- Console/printf: use stdarg.h and treat 64-bit addresses differently from ints (32-bit)
- Update elfhdr to be 64 bit
- entry.S/entryother.S: add code to switch to 64-bit mode: build a simple page table in 32-bit mode before switching to 64-bit mode, share code for entering the boot processor and APs, and tweak the boot gdt. The boot gdt is the gdt that the kernel proper also uses. (In 64-bit mode, the gdt/segmentation and task state mostly disappear.)
- exec.c: fix passing argv (64-bit now instead of 32-bit).
- initcode.S: use syscall instead of int.
- kernel.ld: load the kernel very high, in the top terabyte. 64 bits is a lot of address space!
- proc.c: initial return is through the new syscall path instead of trapret.
- proc.h: update struct cpu to have some scratch space since syscall saves less state than int; update struct context to reflect x86-64 calling conventions.
- swtch: simplify for x86-64 calling conventions.
- syscall: add fetcharg to handle x86-64 calling conventions (6 arguments are passed through registers), and fetchaddr to read a 64-bit value from user space.
- sysfile: update to handle pointers from user space (e.g., sys_exec), which are 64 bits.
- trap.c: no special trap vector for sys calls, because x86-64 has a different plan for system calls.
- trapasm: one plan for syscalls and one plan for traps (interrupts and exceptions). On x86-64, the kernel is responsible for switching user/kernel stacks. To do so, xv6 keeps some scratch space in the cpu structure, and uses MSR GS_KERN_BASE to point to the core's cpu structure (using swapgs).
- types.h: add uint64, and change pde_t to uint64
- usertests: exit() when fork fails, which helped in tracking down one of the bugs in the switch from 32-bit to 64-bit
- vectors: update to make them 64 bits
- vm.c: use bootgdt in the kernel too, program MSRs for syscalls and core-local state (for swapgs), walk 4 levels in walkpgdir, add DEVSPACETOP, use the task segment to set the kernel stack for interrupts (but simpler than in 32-bit mode), and add an extra argument to freevm (size of the user part of the address space) to avoid checking all entries up to KERNBASE (there are MANY TB before the top 1 TB).
- x86: update trapframe to have 64-bit entries, which is what the processor pushes on syscalls and traps. Simplify lgdt and lidt, using struct desctr, which needs the gcc directives packed and aligned.

TODO:
- use int32 instead of int?
- simplify curproc(). xv6 has per-cpu state again, but this time it must have it.
- avoid repetition in walkpgdir
- fix validateint() in usertests.c
- fix bugs (e.g., observed one case of entering kernel with invalid gs or proc
-rw-r--r--  .gdbinit.tmpl        27
-rw-r--r--  .gdbinit.tmpl-i386    5
-rw-r--r--  .gdbinit.tmpl-x64    18
-rw-r--r--  Makefile             79
-rw-r--r--  bootasm.S            88
-rw-r--r--  console.c            30
-rw-r--r--  defs.h               12
-rw-r--r--  elf.h                22
-rw-r--r--  entry.S             273
-rw-r--r--  entryother.S         57
-rw-r--r--  exec.c               30
-rw-r--r--  initcode.S           13
-rw-r--r--  ioapic.c              3
-rw-r--r--  kalloc.c              6
-rw-r--r--  kernel.ld            41
-rw-r--r--  main.c               69
-rw-r--r--  memlayout.h           7
-rw-r--r--  mmu.h               236
-rw-r--r--  mp.c                  6
-rw-r--r--  mp.h                  8
-rw-r--r--  msr.h                25
-rw-r--r--  printf.c             34
-rw-r--r--  proc.c               34
-rw-r--r--  proc.h               21
-rw-r--r--  spinlock.c           10
-rw-r--r--  spinlock.h            2
-rw-r--r--  string.c              2
-rw-r--r--  swtch.S              36
-rw-r--r--  syscall.c            58
-rw-r--r--  sysfile.c             6
-rw-r--r--  trap.c               29
-rw-r--r--  trapasm.S           150
-rw-r--r--  traps.h               1
-rw-r--r--  types.h               8
-rw-r--r--  usertests.c          38
-rw-r--r--  usys.S                2
-rwxr-xr-x  vectors.pl           16
-rw-r--r--  vm.c                213
-rw-r--r--  x86.h               108
39 files changed, 1050 insertions, 773 deletions
diff --git a/.gdbinit.tmpl b/.gdbinit.tmpl
deleted file mode 100644
index f71681a..0000000
--- a/.gdbinit.tmpl
+++ /dev/null
@@ -1,27 +0,0 @@
-set $lastcs = -1
-
-define hook-stop
- # There doesn't seem to be a good way to detect if we're in 16- or
- # 32-bit mode, but in 32-bit mode we always run with CS == 8 in the
- # kernel and CS == 35 in user space
- if $cs == 8 || $cs == 35
- if $lastcs != 8 && $lastcs != 35
- set architecture i386
- end
- x/i $pc
- else
- if $lastcs == -1 || $lastcs == 8 || $lastcs == 35
- set architecture i8086
- end
- # Translate the segment:offset into a physical address
- printf "[%4x:%4x] ", $cs, $eip
- x/i $cs*16+$eip
- end
- set $lastcs = $cs
-end
-
-echo + target remote localhost:1234\n
-target remote localhost:1234
-
-echo + symbol-file kernel\n
-symbol-file kernel
diff --git a/.gdbinit.tmpl-i386 b/.gdbinit.tmpl-i386
new file mode 100644
index 0000000..f4f85d2
--- /dev/null
+++ b/.gdbinit.tmpl-i386
@@ -0,0 +1,5 @@
+python
+gdb.execute("target remote localhost:26000")
+gdb.execute("set architecture i386")
+gdb.execute("symbol-file kernel")
+gdb.execute("break *0x7c00")
diff --git a/.gdbinit.tmpl-x64 b/.gdbinit.tmpl-x64
new file mode 100644
index 0000000..9c120ff
--- /dev/null
+++ b/.gdbinit.tmpl-x64
@@ -0,0 +1,18 @@
+#if you would like to use gdb in 32bit mode, comment out lines 8 and 15, then uncomment
+#the lines after. Note this will only work properly until 64bit mode is enabled in entry.S
+
+python
+gdb.execute("set architecture i386:x86-64:intel")
+gdb.execute("target remote localhost:26000")
+gdb.execute("symbol-file kernel")
+gdb.execute("break start64")
+#gdb.execute("break *0x7c00")
+try:
+ gdb.execute("continue")
+except:
+ pass
+gdb.execute("disconnect")
+gdb.execute("set architecture i386:x86-64")
+#gdb.execute("set architecture i386")
+gdb.execute("target remote localhost:26000")
+gdb.execute("delete break 1")
diff --git a/Makefile b/Makefile
index 09d790c..b199842 100644
--- a/Makefile
+++ b/Makefile
@@ -51,7 +51,7 @@ TOOLPREFIX := $(shell if i386-jos-elf-objdump -i 2>&1 | grep '^elf32-i386$$' >/d
endif
# If the makefile can't find QEMU, specify its path here
-# QEMU = qemu-system-i386
+QEMU = qemu-system-x86_64
# Try to infer the correct QEMU
ifndef QEMU
@@ -76,11 +76,16 @@ AS = $(TOOLPREFIX)gas
LD = $(TOOLPREFIX)ld
OBJCOPY = $(TOOLPREFIX)objcopy
OBJDUMP = $(TOOLPREFIX)objdump
-CFLAGS = -fno-pic -static -fno-builtin -fno-strict-aliasing -O2 -Wall -MD -ggdb -m32 -Werror -fno-omit-frame-pointer
+
+XFLAGS = -m64 -mcmodel=large -ggdb
+# CFLAGS = -fno-pic -static -fno-builtin -fno-strict-aliasing -O2 -Wall -MD -ggdb -Werror -fno-omit-frame-pointer
+CFLAGS = -fno-pic -static -fno-builtin -fno-strict-aliasing -Wall -MD -ggdb -Werror -fno-omit-frame-pointer
+CFLAGS += -ffreestanding -fno-common -nostdlib $(XFLAGS)
CFLAGS += $(shell $(CC) -fno-stack-protector -E -x c /dev/null >/dev/null 2>&1 && echo -fno-stack-protector)
-ASFLAGS = -m32 -gdwarf-2 -Wa,-divide
+ASFLAGS = -gdwarf-2 -Wa,-divide $(XFLAGS)
# FreeBSD ld wants ``elf_i386_fbsd''
-LDFLAGS += -m $(shell $(LD) -V | grep elf_i386 2>/dev/null | head -n 1)
+LDFLAGS += -m $(shell $(LD) -V | grep elf_x86_64 2>/dev/null | head -n 1)
+LDFLAGS += -z max-page-size=4096
# Disable PIE when possible (for Ubuntu 16.10 toolchain)
ifneq ($(shell $(CC) -dumpspecs 2>/dev/null | grep -e '[^f]no-pie'),)
@@ -90,23 +95,10 @@ ifneq ($(shell $(CC) -dumpspecs 2>/dev/null | grep -e '[^f]nopie'),)
CFLAGS += -fno-pie -nopie
endif
-xv6.img: bootblock kernel
- dd if=/dev/zero of=xv6.img count=10000
- dd if=bootblock of=xv6.img conv=notrunc
- dd if=kernel of=xv6.img seek=1 conv=notrunc
-
-xv6memfs.img: bootblock kernelmemfs
- dd if=/dev/zero of=xv6memfs.img count=10000
- dd if=bootblock of=xv6memfs.img conv=notrunc
- dd if=kernelmemfs of=xv6memfs.img seek=1 conv=notrunc
-
-bootblock: bootasm.S bootmain.c
- $(CC) $(CFLAGS) -fno-pic -O -nostdinc -I. -c bootmain.c
- $(CC) $(CFLAGS) -fno-pic -nostdinc -I. -c bootasm.S
- $(LD) $(LDFLAGS) -N -e start -Ttext 0x7C00 -o bootblock.o bootasm.o bootmain.o
- $(OBJDUMP) -S bootblock.o > bootblock.asm
- $(OBJCOPY) -S -O binary -j .text bootblock.o bootblock
- ./sign.pl bootblock
+kernel: $(OBJS) entry.o entryother initcode kernel.ld
+ $(LD) $(LDFLAGS) -T kernel.ld -o kernel entry.o $(OBJS) -b binary initcode entryother
+ $(OBJDUMP) -S kernel > kernel.asm
+ $(OBJDUMP) -t kernel | sed '1,/SYMBOL TABLE/d; s/ .* / /; /^$$/d' > kernel.sym
entryother: entryother.S
$(CC) $(CFLAGS) -fno-pic -nostdinc -I. -c entryother.S
@@ -120,23 +112,6 @@ initcode: initcode.S
$(OBJCOPY) -S -O binary initcode.out initcode
$(OBJDUMP) -S initcode.o > initcode.asm
-kernel: $(OBJS) entry.o entryother initcode kernel.ld
- $(LD) $(LDFLAGS) -T kernel.ld -o kernel entry.o $(OBJS) -b binary initcode entryother
- $(OBJDUMP) -S kernel > kernel.asm
- $(OBJDUMP) -t kernel | sed '1,/SYMBOL TABLE/d; s/ .* / /; /^$$/d' > kernel.sym
-
-# kernelmemfs is a copy of kernel that maintains the
-# disk image in memory instead of writing to a disk.
-# This is not so useful for testing persistent storage or
-# exploring disk buffering implementations, but it is
-# great for testing the kernel on real hardware without
-# needing a scratch disk.
-MEMFSOBJS = $(filter-out ide.o,$(OBJS)) memide.o
-kernelmemfs: $(MEMFSOBJS) entry.o entryother initcode kernel.ld fs.img
- $(LD) $(LDFLAGS) -T kernel.ld -o kernelmemfs entry.o $(MEMFSOBJS) -b binary initcode entryother fs.img
- $(OBJDUMP) -S kernelmemfs > kernelmemfs.asm
- $(OBJDUMP) -t kernelmemfs | sed '1,/SYMBOL TABLE/d; s/ .* / /; /^$$/d' > kernelmemfs.sym
-
tags: $(OBJS) entryother.S _init
etags *.S *.c
@@ -190,8 +165,8 @@ fs.img: mkfs README $(UPROGS)
clean:
rm -f *.tex *.dvi *.idx *.aux *.log *.ind *.ilg \
*.o *.d *.asm *.sym vectors.S bootblock entryother \
- initcode initcode.out kernel xv6.img fs.img kernelmemfs \
- xv6memfs.img mkfs .gdbinit \
+ initcode initcode.out kernel fs.img kernelmemfs \
+ mkfs .gdbinit \
$(UPROGS)
# make a printout
@@ -204,12 +179,6 @@ xv6.pdf: $(PRINT)
print: xv6.pdf
-# run in emulators
-
-bochs : fs.img xv6.img
- if [ ! -e .bochsrc ]; then ln -s dot-bochsrc .bochsrc; fi
- bochs -q
-
# try to generate a unique GDB port
GDBPORT = $(shell expr `id -u` % 5000 + 25000)
# QEMU's gdb stub command line changed in 0.11
@@ -219,25 +188,21 @@ QEMUGDB = $(shell if $(QEMU) -help | grep -q '^-gdb'; \
ifndef CPUS
CPUS := 2
endif
-QEMUOPTS = -drive file=fs.img,index=1,media=disk,format=raw -drive file=xv6.img,index=0,media=disk,format=raw -smp $(CPUS) -m 512 $(QEMUEXTRA)
-
-qemu: fs.img xv6.img
+QEMUOPTS = -kernel kernel -drive file=fs.img,index=1,media=disk,format=raw -smp $(CPUS) -m 512 $(QEMUEXTRA)
+qemu: fs.img
$(QEMU) -serial mon:stdio $(QEMUOPTS)
-qemu-memfs: xv6memfs.img
- $(QEMU) -drive file=xv6memfs.img,index=0,media=disk,format=raw -smp $(CPUS) -m 256
-
-qemu-nox: fs.img xv6.img
+qemu-nox: fs.img kernel
$(QEMU) -nographic $(QEMUOPTS)
-.gdbinit: .gdbinit.tmpl
+.gdbinit: .gdbinit.tmpl-x64
sed "s/localhost:1234/localhost:$(GDBPORT)/" < $^ > $@
-qemu-gdb: fs.img xv6.img .gdbinit
+qemu-gdb: fs.img kernel .gdbinit
@echo "*** Now run 'gdb'." 1>&2
- $(QEMU) -serial mon:stdio $(QEMUOPTS) -S $(QEMUGDB)
+ $(QEMU) $(QEMUOPTS) -S $(QEMUGDB)
-qemu-nox-gdb: fs.img xv6.img .gdbinit
+qemu-nox-gdb: fs.img kernel .gdbinit
@echo "*** Now run 'gdb'." 1>&2
$(QEMU) -nographic $(QEMUOPTS) -S $(QEMUGDB)
diff --git a/bootasm.S b/bootasm.S
deleted file mode 100644
index 257867c..0000000
--- a/bootasm.S
+++ /dev/null
@@ -1,88 +0,0 @@
-#include "asm.h"
-#include "memlayout.h"
-#include "mmu.h"
-
-# Start the first CPU: switch to 32-bit protected mode, jump into C.
-# The BIOS loads this code from the first sector of the hard disk into
-# memory at physical address 0x7c00 and starts executing in real mode
-# with %cs=0 %ip=7c00.
-
-.code16 # Assemble for 16-bit mode
-.globl start
-start:
- cli # BIOS enabled interrupts; disable
-
- # Zero data segment registers DS, ES, and SS.
- xorw %ax,%ax # Set %ax to zero
- movw %ax,%ds # -> Data Segment
- movw %ax,%es # -> Extra Segment
- movw %ax,%ss # -> Stack Segment
-
- # Physical address line A20 is tied to zero so that the first PCs
- # with 2 MB would run software that assumed 1 MB. Undo that.
-seta20.1:
- inb $0x64,%al # Wait for not busy
- testb $0x2,%al
- jnz seta20.1
-
- movb $0xd1,%al # 0xd1 -> port 0x64
- outb %al,$0x64
-
-seta20.2:
- inb $0x64,%al # Wait for not busy
- testb $0x2,%al
- jnz seta20.2
-
- movb $0xdf,%al # 0xdf -> port 0x60
- outb %al,$0x60
-
- # Switch from real to protected mode. Use a bootstrap GDT that makes
- # virtual addresses map directly to physical addresses so that the
- # effective memory map doesn't change during the transition.
- lgdt gdtdesc
- movl %cr0, %eax
- orl $CR0_PE, %eax
- movl %eax, %cr0
-
-//PAGEBREAK!
- # Complete the transition to 32-bit protected mode by using a long jmp
- # to reload %cs and %eip. The segment descriptors are set up with no
- # translation, so that the mapping is still the identity mapping.
- ljmp $(SEG_KCODE<<3), $start32
-
-.code32 # Tell assembler to generate 32-bit code now.
-start32:
- # Set up the protected-mode data segment registers
- movw $(SEG_KDATA<<3), %ax # Our data segment selector
- movw %ax, %ds # -> DS: Data Segment
- movw %ax, %es # -> ES: Extra Segment
- movw %ax, %ss # -> SS: Stack Segment
- movw $0, %ax # Zero segments not ready for use
- movw %ax, %fs # -> FS
- movw %ax, %gs # -> GS
-
- # Set up the stack pointer and call into C.
- movl $start, %esp
- call bootmain
-
- # If bootmain returns (it shouldn't), trigger a Bochs
- # breakpoint if running under Bochs, then loop.
- movw $0x8a00, %ax # 0x8a00 -> port 0x8a00
- movw %ax, %dx
- outw %ax, %dx
- movw $0x8ae0, %ax # 0x8ae0 -> port 0x8a00
- outw %ax, %dx
-spin:
- jmp spin
-
-# Bootstrap GDT
-.p2align 2 # force 4 byte alignment
-gdt:
- SEG_NULLASM # null seg
- SEG_ASM(STA_X|STA_R, 0x0, 0xffffffff) # code seg
- SEG_ASM(STA_W, 0x0, 0xffffffff) # data seg
-
-gdtdesc:
- .word (gdtdesc - gdt - 1) # sizeof(gdt) - 1
- .long gdt # address gdt
-
diff --git a/console.c b/console.c
index a280d2b..9986a9c 100644
--- a/console.c
+++ b/console.c
@@ -2,6 +2,8 @@
// Input is from the keyboard or serial port.
// Output is written to the screen and serial port.
+#include <stdarg.h>
+
#include "types.h"
#include "defs.h"
#include "param.h"
@@ -24,10 +26,11 @@ static struct {
int locking;
} cons;
+static char digits[] = "0123456789abcdef";
+
static void
printint(int xx, int base, int sign)
{
- static char digits[] = "0123456789abcdef";
char buf[16];
int i;
uint x;
@@ -48,14 +51,25 @@ printint(int xx, int base, int sign)
while(--i >= 0)
consputc(buf[i]);
}
+
+static void
+printptr(uint64 x) {
+ int i;
+ consputc('0');
+ consputc('x');
+ for (i = 0; i < (sizeof(uint64) * 2); i++, x <<= 4)
+ consputc(digits[x >> (sizeof(uint64) * 8 - 4)]);
+}
+
+
//PAGEBREAK: 50
// Print to the console. only understands %d, %x, %p, %s.
void
cprintf(char *fmt, ...)
{
+ va_list ap;
int i, c, locking;
- uint *argp;
char *s;
locking = cons.locking;
@@ -65,7 +79,7 @@ cprintf(char *fmt, ...)
if (fmt == 0)
panic("null fmt");
- argp = (uint*)(void*)(&fmt + 1);
+ va_start(ap, fmt);
for(i = 0; (c = fmt[i] & 0xff) != 0; i++){
if(c != '%'){
consputc(c);
@@ -76,14 +90,16 @@ cprintf(char *fmt, ...)
break;
switch(c){
case 'd':
- printint(*argp++, 10, 1);
+ printint(va_arg(ap, int), 10, 1);
break;
case 'x':
+ printint(va_arg(ap, int), 16, 1);
+ break;
case 'p':
- printint(*argp++, 16, 0);
+ printptr(va_arg(ap, uint64));
break;
case 's':
- if((s = (char*)*argp++) == 0)
+ if((s = va_arg(ap, char*)) == 0)
s = "(null)";
for(; *s; s++)
consputc(*s);
@@ -107,7 +123,7 @@ void
panic(char *s)
{
int i;
- uint pcs[10];
+ uint64 pcs[10];
cli();
cons.locking = 0;
diff --git a/defs.h b/defs.h
index 82fb982..fd9ecb4 100644
--- a/defs.h
+++ b/defs.h
@@ -126,7 +126,7 @@ void swtch(struct context**, struct context*);
// spinlock.c
void acquire(struct spinlock*);
-void getcallerpcs(void*, uint*);
+void getcallerpcs(void*, uint64*);
int holding(struct spinlock*);
void initlock(struct spinlock*, char*);
void release(struct spinlock*);
@@ -152,8 +152,10 @@ char* strncpy(char*, const char*, int);
int argint(int, int*);
int argptr(int, char**, int);
int argstr(int, char**);
-int fetchint(uint, int*);
-int fetchstr(uint, char**);
+int argaddr(int, uint64 *);
+int fetchint(uint64, int*);
+int fetchstr(uint64, char**);
+int fetchaddr(uint64, uint64*);
void syscall(void);
// timer.c
@@ -176,8 +178,8 @@ void kvmalloc(void);
pde_t* setupkvm(void);
char* uva2ka(pde_t*, char*);
int allocuvm(pde_t*, uint, uint);
-int deallocuvm(pde_t*, uint, uint);
-void freevm(pde_t*);
+int deallocuvm(pde_t*, uint64, uint64);
+void freevm(pde_t*, uint64);
void inituvm(pde_t*, char*, uint);
int loaduvm(pde_t*, char*, struct inode*, uint, uint);
pde_t* copyuvm(pde_t*, uint);
diff --git a/elf.h b/elf.h
index d16c967..84555fa 100644
--- a/elf.h
+++ b/elf.h
@@ -9,9 +9,9 @@ struct elfhdr {
ushort type;
ushort machine;
uint version;
- uint entry;
- uint phoff;
- uint shoff;
+ uint64 entry;
+ uint64 phoff;
+ uint64 shoff;
uint flags;
ushort ehsize;
ushort phentsize;
@@ -23,14 +23,14 @@ struct elfhdr {
// Program section header
struct proghdr {
- uint type;
- uint off;
- uint vaddr;
- uint paddr;
- uint filesz;
- uint memsz;
- uint flags;
- uint align;
+ uint32 type;
+ uint32 flags;
+ uint64 off;
+ uint64 vaddr;
+ uint64 paddr;
+ uint64 filesz;
+ uint64 memsz;
+ uint64 align;
};
// Values for Proghdr type
diff --git a/entry.S b/entry.S
index bc79bab..88ad92b 100644
--- a/entry.S
+++ b/entry.S
@@ -1,68 +1,223 @@
-# The xv6 kernel starts executing in this file. This file is linked with
-# the kernel C code, so it can refer to kernel symbols such as main().
-# The boot block (bootasm.S and bootmain.c) jumps to entry below.
-
-# Multiboot header, for multiboot boot loaders like GNU Grub.
+# x86-64 bootstrap, assuming load by MultiBoot-compliant loader.
+# The MultiBoot specification is at:
# http://www.gnu.org/software/grub/manual/multiboot/multiboot.html
-#
-# Using GRUB 2, you can boot xv6 from a file stored in a
-# Linux file system by copying kernel or kernelmemfs to /boot
-# and then adding this menu entry:
-#
-# menuentry "xv6" {
-# insmod ext2
-# set root='(hd0,msdos1)'
-# set kernel='/boot/kernel'
-# echo "Loading ${kernel}..."
-# multiboot ${kernel} ${kernel}
-# boot
-# }
-
-#include "asm.h"
-#include "memlayout.h"
+# GRUB is a MultiBoot loader, as is qemu's -kernel option.
+
#include "mmu.h"
-#include "param.h"
+#include "memlayout.h"
+
+# STACK is the size of the bootstrap stack.
+#define STACK 8192
-# Multiboot header. Data to direct multiboot loader.
-.p2align 2
+# MultiBoot header.
+# http://www.gnu.org/software/grub/manual/multiboot/multiboot.html#Header-layout
+.align 4
.text
.globl multiboot_header
multiboot_header:
#define magic 0x1badb002
- #define flags 0
+ #define flags (1<<16 | 1<<0)
.long magic
.long flags
- .long (-magic-flags)
-
-# By convention, the _start symbol specifies the ELF entry point.
-# Since we haven't set up virtual memory yet, our entry point is
-# the physical address of 'entry'.
-.globl _start
-_start = V2P_WO(entry)
-
-# Entering xv6 on boot processor, with paging off.
-.globl entry
-entry:
- # Turn on page size extension for 4Mbyte pages
- movl %cr4, %eax
- orl $(CR4_PSE), %eax
- movl %eax, %cr4
- # Set page directory
- movl $(V2P_WO(entrypgdir)), %eax
- movl %eax, %cr3
- # Turn on paging.
- movl %cr0, %eax
- orl $(CR0_PG|CR0_WP), %eax
- movl %eax, %cr0
-
- # Set up the stack pointer.
- movl $(stack + KSTACKSIZE), %esp
-
- # Jump to main(), and switch to executing at
- # high addresses. The indirect call is needed because
- # the assembler produces a PC-relative instruction
- # for a direct jump.
- mov $main, %eax
- jmp *%eax
-
-.comm stack, KSTACKSIZE
+ .long (- magic - flags) # checksum
+ .long V2P_WO(multiboot_header) # header address
+ .long V2P_WO(multiboot_header) # load address
+ .long V2P_WO(edata) # load end address
+ .long V2P_WO(end) # bss end address
+ .long V2P_WO(start) # entry address
+
+# Entry point jumped to by boot loader. Running in 32-bit mode.
+# http://www.gnu.org/software/grub/manual/multiboot/multiboot.html#Machine-state
+#
+# EAX = 0x2badb002
+# EBX = address of multiboot information structure
+# CS = 32-bit read/execute code segment with identity map
+# DS, ES, FS, GS, SS = 32-bit read/write data segment with identity map
+# A20 gate = enabled
+# CR0 = PE set, PG clear
+# EFLAGS = VM clear, IF clear
+#
+.code32
+.globl start
+start:
+ # Tell BIOS to do "warm reboot" when we shut down.
+ movw $0x1234, 0x472
+
+ # Set up multiboot arguments for main.
+ movl %eax, %edi
+ movl %ebx, %esi
+
+ # Initialize stack.
+ movl $V2P_WO(stack+STACK), %esp
+
+ # Zero bss. QEMU's MultiBoot seems not to.
+ # It's possible that the header above is not right, but it looks right.
+ # %edi is holding multiboot argument, so save in another register.
+ # (The stack is in the bss.)
+ movl %edi, %edx
+ movl $V2P_WO(edata), %edi
+ movl $V2P_WO(end), %ecx
+ subl $V2P_WO(edata), %ecx
+ movl $0, %eax
+ cld
+ rep stosb
+ movl %edx, %edi
+
+ call loadgdt
+
+ # Enter new 32-bit code segment (already in 32-bit mode).
+ ljmp $KCSEG32, $V2P_WO(start32) // code32 segment selector
+
+start32:
+ # Initialize page table.
+ call initpagetables
+ call init32e
+
+ movl $V2P_WO(start64), %eax
+ # Enter 64-bit mode.
+ ljmp $KCSEG, $V2P_WO(tramp64) // code64 segment selector
+
+.code64
+start64:
+ # Load VA of stack
+ movabsq $(stack+STACK), %rsp
+ # Clear frame pointer for stack walks
+ movl $0, %ebp
+ # Call into C code.
+ call bpmain
+ # should not return from bpmain
+ jmp .
+
+.code32
+.global apstart
+apstart:
+ call loadgdt
+ ljmp $KCSEG32, $V2P_WO(apstart32) // code32 segment selector
+
+apstart32:
+ call init32e
+ movl $V2P_WO(apstart64), %eax
+ ljmp $KCSEG, $V2P_WO(tramp64) // code64 segment selector
+
+.code64
+apstart64:
+ # Remember (from bootothers), that our kernel stack pointer is
+ # at the top of our temporary stack.
+ popq %rax
+ movq %rax, %rsp
+ movq $0, %rbp
+ call apmain
+1: jmp 1b
+
+.code64
+tramp64:
+ # The linker thinks we are running at tramp64, but we're actually
+ # running at PADDR(tramp64), so use an explicit calculation to
+ # load and jump to the correct address. %rax should hold the
+ # physical address of the jmp target.
+ movq $KERNBASE, %r11
+ addq %r11, %rax
+ jmp *%rax
+
+# Initial stack
+.comm stack, STACK
+
+# Page tables. See section 4.5 of 253668.pdf.
+# We map the first GB of physical memory at 0 and at 1 TB (not GB) before
+# the end of virtual memory. At boot time we are using the mapping at 0
+# but during ordinary execution we use the high mapping.
+# The intent is that after bootstrap the kernel can expand this mapping
+# to cover all the available physical memory.
+# This would be easier if we could use the PS bit to create GB-sized entries
+# and skip the pdt table, but not all chips support it, and QEMU doesn't.
+.align 4096
+pml4:
+ .quad V2P_WO(pdpt) + PTE_P + PTE_W // present, read/write
+ .quad 0
+ .space 4096 - 2*16
+ .quad V2P_WO(pdpt) + PTE_P + PTE_W
+ .quad 0
+
+.align 4096
+pdpt:
+ .quad V2P_WO(pdt) + PTE_P + PTE_W
+ .space 4096 - 8
+
+.align 4096
+pdt:
+ // Filled in below.
+ .space 4096
+
+.code32
+initpagetables:
+ pushl %edi
+ pushl %ecx
+ pushl %eax
+
+ // Set up 64-bit entry in %edx:%eax.
+ // Base address 0, present, read/write, large page.
+ movl $(0 | PTE_P | PTE_W | PTE_PS), %eax
+ movl $0, %edx
+
+ // Fill in 512 entries at pdt.
+ movl $V2P_WO(pdt), %edi
+ movl $512, %ecx
+1:
+ // Write this 64-bit entry.
+ movl %eax, 0(%edi)
+ movl %edx, 4(%edi)
+ addl $8, %edi
+ // 64-bit add to prepare address for next entry.
+ // Because this is a large page entry, it covers 512 4k pages (2 MB).
+ add $(512*4096), %eax
+ adc $0, %edx
+ loop 1b
+
+ popl %eax
+ popl %ecx
+ popl %edi
+ ret
+
+# Initialize IA-32e mode. See section 9.8.5 of 253668.pdf.
+init32e:
+ # Set CR4.PAE and CR4.PSE = 1.
+ movl %cr4, %eax
+ orl $0x30, %eax
+ movl %eax, %cr4
+
+ # Load CR3 with physical base address of level 4 page table.
+ movl $V2P_WO(pml4), %eax
+ movl %eax, %cr3
+
+ # Enable IA-32e mode by setting IA32_EFER.LME = 1.
+ # Also turn on IA32_EFER.SCE (syscall enable).
+ movl $0xc0000080, %ecx
+ rdmsr
+ orl $0x101, %eax
+ wrmsr
+
+ # Enable paging by setting CR0.PG = 1.
+ movl %cr0, %eax
+ orl $0x80000000, %eax
+ movl %eax, %cr0
+ nop
+ nop
+
+ ret
+
+loadgdt:
+ subl $8, %esp
+ movl $V2P_WO(bootgdt), 4(%esp)
+ movw $(8*NSEGS-1), 2(%esp)
+ lgdt 2(%esp)
+ addl $8, %esp
+
+ movl $KDSEG, %eax // data segment selector
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %ss
+ movl $0, %eax // null segment selector
+ movw %ax, %fs
+ movw %ax, %gs
+
+ ret
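
[Editorial note on the pml4 layout above, illustration only.] The second pointer to pdpt is placed 2*16 bytes before the end of the page, i.e. at byte offset 4080, which is PML4 slot 510 -- the slot that covers the high mapping at KERNBASE. A small standalone check of that index arithmetic, using the PML4XSHIFT/PXMASK values this commit adds to mmu.h:

/* Illustration only: the high pdpt pointer in pml4 sits at byte offset
 * 16 + (4096 - 2*16) = 4080, i.e. slot 4080/8 = 510, and slot 510 is
 * exactly PMX(KERNBASE) with the constants from this commit's mmu.h. */
#include <stdio.h>
#include <stdint.h>

#define PML4XSHIFT 39
#define PXMASK     0x1FF
#define PMX(va)    (((uint64_t)(va) >> PML4XSHIFT) & PXMASK)

int
main(void)
{
  uint64_t kernbase = 0xFFFFFF0000000000ULL;   /* KERNBASE in memlayout.h */
  printf("PMX(KERNBASE) = %llu\n",
         (unsigned long long)PMX(kernbase));            /* prints 510 */
  printf("slot offset   = %llu bytes\n",
         (unsigned long long)(PMX(kernbase) * 8));      /* prints 4080 */
  return 0;
}
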
diff --git a/entryother.S b/entryother.S
index a3b6dc2..3e502f3 100644
--- a/entryother.S
+++ b/entryother.S
@@ -13,11 +13,9 @@
#
# Startothers (in main.c) sends the STARTUPs one at a time.
# It copies this code (start) at 0x7000. It puts the address of
-# a newly allocated per-core stack in start-4,the address of the
-# place to jump to (mpenter) in start-8, and the physical address
+# a newly allocated per-core stack in start-12,the address of the
+# place to jump to (apstart32) in start-4, and the physical address
# of entrypgdir in start-12.
-#
-# This code combines elements of bootasm.S and entry.S.
.code16
.globl start
@@ -41,53 +39,22 @@ start:
# Complete the transition to 32-bit protected mode by using a long jmp
# to reload %cs and %eip. The segment descriptors are set up with no
# translation, so that the mapping is still the identity mapping.
- ljmpl $(SEG_KCODE<<3), $(start32)
+ ljmpl $(KCSEG32), $start32
-//PAGEBREAK!
-.code32 # Tell assembler to generate 32-bit code now.
+.code32
start32:
- # Set up the protected-mode data segment registers
- movw $(SEG_KDATA<<3), %ax # Our data segment selector
- movw %ax, %ds # -> DS: Data Segment
- movw %ax, %es # -> ES: Extra Segment
- movw %ax, %ss # -> SS: Stack Segment
- movw $0, %ax # Zero segments not ready for use
- movw %ax, %fs # -> FS
- movw %ax, %gs # -> GS
-
- # Turn on page size extension for 4Mbyte pages
- movl %cr4, %eax
- orl $(CR4_PSE), %eax
- movl %eax, %cr4
- # Use entrypgdir as our initial page table
- movl (start-12), %eax
- movl %eax, %cr3
- # Turn on paging.
- movl %cr0, %eax
- orl $(CR0_PE|CR0_PG|CR0_WP), %eax
- movl %eax, %cr0
+ movl $start-12, %esp
+ movl start-4, %ecx
+ jmp *%ecx
- # Switch to the stack allocated by startothers()
- movl (start-4), %esp
- # Call mpenter()
- call *(start-8)
-
- movw $0x8a00, %ax
- movw %ax, %dx
- outw %ax, %dx
- movw $0x8ae0, %ax
- outw %ax, %dx
-spin:
- jmp spin
-
-.p2align 2
+.align 4
gdt:
SEG_NULLASM
- SEG_ASM(STA_X|STA_R, 0, 0xffffffff)
- SEG_ASM(STA_W, 0, 0xffffffff)
-
+ SEG_ASM(0xa, 0, 0xffffffff)
+ SEG_ASM(0x2, 0, 0xffffffff)
+.align 16
gdtdesc:
- .word (gdtdesc - gdt - 1)
+ .word 0x17 # sizeof(gdt)-1
.long gdt
diff --git a/exec.c b/exec.c
index b40134f..b1a9229 100644
--- a/exec.c
+++ b/exec.c
@@ -4,6 +4,8 @@
#include "mmu.h"
#include "proc.h"
#include "defs.h"
+#include "traps.h"
+#include "msr.h"
#include "x86.h"
#include "elf.h"
@@ -12,18 +14,18 @@ exec(char *path, char **argv)
{
char *s, *last;
int i, off;
- uint argc, sz, sp, ustack[3+MAXARG+1];
+ uint64 argc, sz, sp, ustack[3+MAXARG+1];
struct elfhdr elf;
struct inode *ip;
struct proghdr ph;
pde_t *pgdir, *oldpgdir;
struct proc *curproc = myproc();
-
+ uint64 oldsz = curproc->sz;
+
begin_op();
if((ip = namei(path)) == 0){
end_op();
- cprintf("exec: fail\n");
return -1;
}
ilock(ip);
@@ -72,7 +74,7 @@ exec(char *path, char **argv)
for(argc = 0; argv[argc]; argc++) {
if(argc >= MAXARG)
goto bad;
- sp = (sp - (strlen(argv[argc]) + 1)) & ~3;
+ sp = (sp - (strlen(argv[argc]) + 1)) & ~(sizeof(uint64)-1);
if(copyout(pgdir, sp, argv[argc], strlen(argv[argc]) + 1) < 0)
goto bad;
ustack[3+argc] = sp;
@@ -81,10 +83,13 @@ exec(char *path, char **argv)
ustack[0] = 0xffffffff; // fake return PC
ustack[1] = argc;
- ustack[2] = sp - (argc+1)*4; // argv pointer
+ ustack[2] = sp - (argc+1)*sizeof(uint64); // argv pointer
+
+ curproc->tf->rdi = argc;
+ curproc->tf->rsi = sp - (argc+1)*sizeof(uint64);
- sp -= (3+argc+1) * 4;
- if(copyout(pgdir, sp, ustack, (3+argc+1)*4) < 0)
+ sp -= (3+argc+1) * sizeof(uint64);
+ if(copyout(pgdir, sp, ustack, (3+argc+1)*sizeof(uint64)) < 0)
goto bad;
// Save program name for debugging.
@@ -92,20 +97,21 @@ exec(char *path, char **argv)
if(*s == '/')
last = s+1;
safestrcpy(curproc->name, last, sizeof(curproc->name));
-
+
// Commit to the user image.
oldpgdir = curproc->pgdir;
curproc->pgdir = pgdir;
curproc->sz = sz;
- curproc->tf->eip = elf.entry; // main
- curproc->tf->esp = sp;
+ curproc->tf->rip = elf.entry; // main
+ curproc->tf->rcx = elf.entry;
+ curproc->tf->rsp = sp;
switchuvm(curproc);
- freevm(oldpgdir);
+ freevm(oldpgdir, oldsz);
return 0;
bad:
if(pgdir)
- freevm(pgdir);
+ freevm(pgdir, sz);
if(ip){
iunlockput(ip);
end_op();
diff --git a/initcode.S b/initcode.S
index 80ac5d8..e097394 100644
--- a/initcode.S
+++ b/initcode.S
@@ -8,16 +8,15 @@
# exec(init, argv)
.globl start
start:
- pushl $argv
- pushl $init
- pushl $0 // where caller pc would be
- movl $SYS_exec, %eax
- int $T_SYSCALL
+ mov $init, %rdi
+ mov $argv, %rsi
+ mov $SYS_exec, %rax
+ syscall
# for(;;) exit();
exit:
- movl $SYS_exit, %eax
- int $T_SYSCALL
+ mov $SYS_exit, %rax
+ syscall
jmp exit
# char init[] = "/init\0";
diff --git a/ioapic.c b/ioapic.c
index cb0f015..bbe5f9b 100644
--- a/ioapic.c
+++ b/ioapic.c
@@ -4,6 +4,7 @@
#include "types.h"
#include "defs.h"
+#include "memlayout.h"
#include "traps.h"
#define IOAPIC 0xFEC00000 // Default physical address of IO APIC
@@ -50,7 +51,7 @@ ioapicinit(void)
{
int i, id, maxintr;
- ioapic = (volatile struct ioapic*)IOAPIC;
+ ioapic = P2V((volatile struct ioapic*)IOAPIC);
maxintr = (ioapicread(REG_VER) >> 16) & 0xFF;
id = ioapicread(REG_ID) >> 24;
if(id != ioapicid)
diff --git a/kalloc.c b/kalloc.c
index 14cd4f4..fb939b7 100644
--- a/kalloc.c
+++ b/kalloc.c
@@ -47,7 +47,7 @@ void
freerange(void *vstart, void *vend)
{
char *p;
- p = (char*)PGROUNDUP((uint)vstart);
+ p = (char*)PGROUNDUP((uint64)vstart);
for(; p + PGSIZE <= (char*)vend; p += PGSIZE)
kfree(p);
}
@@ -61,7 +61,7 @@ kfree(char *v)
{
struct run *r;
- if((uint)v % PGSIZE || v < end || V2P(v) >= PHYSTOP)
+ if((uint64)v % PGSIZE || v < end || V2P(v) >= PHYSTOP)
panic("kfree");
// Fill with junk to catch dangling refs.
@@ -91,6 +91,8 @@ kalloc(void)
kmem.freelist = r->next;
if(kmem.use_lock)
release(&kmem.lock);
+ if(r != 0 && (uint64) r < KERNBASE)
+ panic("kalloc");
return (char*)r;
}
diff --git a/kernel.ld b/kernel.ld
index e24c860..e78fd38 100644
--- a/kernel.ld
+++ b/kernel.ld
@@ -1,22 +1,13 @@
-/* Simple linker script for the JOS kernel.
- See the GNU ld 'info' manual ("info ld") to learn the syntax. */
-
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(_start)
+OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
+OUTPUT_ARCH(i386:x86-64)
SECTIONS
{
- /* Link the kernel at this address: "." means the current address */
- /* Must be equal to KERNLINK */
- . = 0x80100000;
-
+ . = 0xFFFFFF0000100000;
+ PROVIDE(text = .);
.text : AT(0x100000) {
*(.text .stub .text.* .gnu.linkonce.t.*)
}
-
- PROVIDE(etext = .); /* Define the 'etext' symbol to this value */
-
.rodata : {
*(.rodata .rodata.* .gnu.linkonce.r.*)
}
@@ -38,31 +29,21 @@ SECTIONS
for this section */
}
- /* Adjust the address for the data segment to the next page */
. = ALIGN(0x1000);
- /* Conventionally, Unix linkers provide pseudo-symbols
- * etext, edata, and end, at the end of the text, data, and bss.
- * For the kernel mapping, we need the address at the beginning
- * of the data section, but that's not one of the conventional
- * symbols, because the convention started before there was a
- * read-only rodata section between text and data. */
- PROVIDE(data = .);
-
- /* The data segment */
+ /* Conventionally, Unix linkers provide pseudo-symbols
+ * etext, edata, and end, at the end of the text, data, and bss.
+ * For the kernel mapping, we need the address at the beginning
+ * of the data section, but that's not one of the conventional
+ * symbols, because the convention started before there was a
+ * read-only rodata section between text and data. */
+ PROVIDE(data = .);
.data : {
*(.data)
}
-
PROVIDE(edata = .);
-
.bss : {
*(.bss)
}
-
PROVIDE(end = .);
-
- /DISCARD/ : {
- *(.eh_frame .note.GNU-stack)
- }
}
diff --git a/main.c b/main.c
index 9924e64..449396a 100644
--- a/main.c
+++ b/main.c
@@ -6,17 +6,22 @@
#include "proc.h"
#include "x86.h"
-static void startothers(void);
-static void mpmain(void) __attribute__((noreturn));
extern pde_t *kpgdir;
extern char end[]; // first address after kernel loaded from ELF file
+static void main(void) __attribute__((noreturn));
+static void startothers(void);
+
+
// Bootstrap processor starts running C code here.
// Allocate a real stack and switch to it, first
// doing some setup required for memory allocator to work.
int
-main(void)
+bpmain(uint64 mbmagic, uint64 mbaddr)
{
+ if(mbmagic != 0x2badb002)
+ panic("multiboot header not found");
+
kinit1(end, P2V(4*1024*1024)); // phys page allocator
kvmalloc(); // kernel page table
mpinit(); // detect other processors
@@ -30,26 +35,19 @@ main(void)
tvinit(); // trap vectors
binit(); // buffer cache
fileinit(); // file table
- ideinit(); // disk
+ ideinit(); // disk
+
startothers(); // start other processors
+
kinit2(P2V(4*1024*1024), P2V(PHYSTOP)); // must come after startothers()
userinit(); // first user process
- mpmain(); // finish this processor's setup
-}
-
-// Other CPUs jump here from entryother.S.
-static void
-mpenter(void)
-{
- switchkvm();
- seginit();
- lapicinit();
- mpmain();
+ main();
+ return 0;
}
// Common CPU setup code.
static void
-mpmain(void)
+main(void)
{
cprintf("cpu%d: starting %d\n", cpuid(), cpuid());
idtinit(); // load idt register
@@ -57,7 +55,17 @@ mpmain(void)
scheduler(); // start running processes
}
-pde_t entrypgdir[]; // For entry.S
+// Other CPUs jump here from entryother.S.
+void
+apmain(void)
+{
+ switchkvm();
+ seginit();
+ lapicinit();
+ main();
+}
+
+void apstart(void);
// Start the non-boot (AP) processors.
static void
@@ -72,7 +80,7 @@ startothers(void)
// The linker has placed the image of entryother.S in
// _binary_entryother_start.
code = P2V(0x7000);
- memmove(code, _binary_entryother_start, (uint)_binary_entryother_size);
+ memmove(code, _binary_entryother_start, (uint64)_binary_entryother_size);
for(c = cpus; c < cpus+ncpu; c++){
if(c == mycpu()) // We've started already.
@@ -82,9 +90,8 @@ startothers(void)
// pgdir to use. We cannot use kpgdir yet, because the AP processor
// is running in low memory, so we use entrypgdir for the APs too.
stack = kalloc();
- *(void**)(code-4) = stack + KSTACKSIZE;
- *(void(**)(void))(code-8) = mpenter;
- *(int**)(code-12) = (void *) V2P(entrypgdir);
+ *(uint32*)(code-4) = V2P(apstart);
+ *(uint64*)(code-12) = (uint64) (stack+KSTACKSIZE);
lapicstartap(c->apicid, V2P(code));
@@ -94,23 +101,3 @@ startothers(void)
}
}
-// The boot page table used in entry.S and entryother.S.
-// Page directories (and page tables) must start on page boundaries,
-// hence the __aligned__ attribute.
-// PTE_PS in a page directory entry enables 4Mbyte pages.
-
-__attribute__((__aligned__(PGSIZE)))
-pde_t entrypgdir[NPDENTRIES] = {
- // Map VA's [0, 4MB) to PA's [0, 4MB)
- [0] = (0) | PTE_P | PTE_W | PTE_PS,
- // Map VA's [KERNBASE, KERNBASE+4MB) to PA's [0, 4MB)
- [KERNBASE>>PDXSHIFT] = (0) | PTE_P | PTE_W | PTE_PS,
-};
-
-//PAGEBREAK!
-// Blank page.
-//PAGEBREAK!
-// Blank page.
-//PAGEBREAK!
-// Blank page.
-
diff --git a/memlayout.h b/memlayout.h
index d1615f7..87818d3 100644
--- a/memlayout.h
+++ b/memlayout.h
@@ -2,13 +2,14 @@
#define EXTMEM 0x100000 // Start of extended memory
#define PHYSTOP 0xE000000 // Top physical memory
-#define DEVSPACE 0xFE000000 // Other devices are at high addresses
+#define DEVSPACE 0xFE000000 // Other devices are top of 32-bit address space
+#define DEVSPACETOP 0x100000000
// Key addresses for address space layout (see kmap in vm.c for layout)
-#define KERNBASE 0x80000000 // First kernel virtual address
+#define KERNBASE 0xFFFFFF0000000000 // First kernel virtual address
#define KERNLINK (KERNBASE+EXTMEM) // Address where kernel is linked
-#define V2P(a) (((uint) (a)) - KERNBASE)
+#define V2P(a) (((uint64) (a)) - KERNBASE)
#define P2V(a) ((void *)(((char *) (a)) + KERNBASE))
#define V2P_WO(x) ((x) - KERNBASE) // same as V2P, but without casts
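
[Editorial note, illustration only.] The new KERNBASE above puts the kernel's high mapping exactly one terabyte below the top of the 64-bit address space, and V2P of the link address gives the physical load address used by kernel.ld's AT(0x100000). The following standalone check is only a sketch of that arithmetic, not part of the commit:

/* Host-side illustration: layout implied by KERNBASE, KERNLINK
 * (KERNBASE + EXTMEM), and the AT(0x100000) directive in kernel.ld. */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
  uint64_t kernbase = 0xFFFFFF0000000000ULL;   /* KERNBASE */
  uint64_t kernlink = kernbase + 0x100000;     /* KERNLINK = KERNBASE + EXTMEM */

  /* Distance from KERNBASE to the top of the address space: 1 TB. */
  printf("TB below top: %llu\n",
         (unsigned long long)((0ULL - kernbase) >> 40));
  /* V2P(KERNLINK) is the physical address the kernel is loaded at. */
  printf("V2P(KERNLINK) = %#llx\n",
         (unsigned long long)(kernlink - kernbase));
  return 0;
}
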
diff --git a/mmu.h b/mmu.h
index a82d8e2..9450d90 100644
--- a/mmu.h
+++ b/mmu.h
@@ -2,8 +2,10 @@
// x86 memory management unit (MMU).
// Eflags register
+#define FL_TF 0x00000100 // Trap Flag
#define FL_IF 0x00000200 // Interrupt Enable
+
// Control Register flags
#define CR0_PE 0x00000001 // Protection Enable
#define CR0_WP 0x00010000 // Write Protect
@@ -11,81 +13,104 @@
#define CR4_PSE 0x00000010 // Page size extension
-// various segment selectors.
-#define SEG_KCODE 1 // kernel code
-#define SEG_KDATA 2 // kernel data+stack
-#define SEG_UCODE 3 // user code
-#define SEG_UDATA 4 // user data+stack
-#define SEG_TSS 5 // this process's task state
+// Segment selectors (indexes) in our GDTs.
+// Defined by our convention, not the architecture.
+#define KCSEG32 (1<<3) /* kernel 32-bit code segment */
+#define KCSEG (2<<3) /* kernel code segment */
+#define KDSEG (3<<3) /* kernel data segment */
+#define TSSSEG (4<<3) /* tss segment - takes two slots */
+#define UDSEG (6<<3) /* user data segment */
+#define UCSEG (7<<3) /* user code segment */
-// cpu->gdt[NSEGS] holds the above segments.
-#define NSEGS 6
+#define NSEGS 8
#ifndef __ASSEMBLER__
-// Segment Descriptor
struct segdesc {
- uint lim_15_0 : 16; // Low bits of segment limit
- uint base_15_0 : 16; // Low bits of segment base address
- uint base_23_16 : 8; // Middle bits of segment base address
- uint type : 4; // Segment type (see STS_ constants)
- uint s : 1; // 0 = system, 1 = application
- uint dpl : 2; // Descriptor Privilege Level
- uint p : 1; // Present
- uint lim_19_16 : 4; // High bits of segment limit
- uint avl : 1; // Unused (available for software use)
- uint rsv1 : 1; // Reserved
- uint db : 1; // 0 = 16-bit segment, 1 = 32-bit segment
- uint g : 1; // Granularity: limit scaled by 4K when set
- uint base_31_24 : 8; // High bits of segment base address
+ uint16 limit0;
+ uint16 base0;
+ uint8 base1;
+ uint8 bits;
+ uint8 bitslimit1;
+ uint8 base2;
};
-// Normal segment
-#define SEG(type, base, lim, dpl) (struct segdesc) \
-{ ((lim) >> 12) & 0xffff, (uint)(base) & 0xffff, \
- ((uint)(base) >> 16) & 0xff, type, 1, dpl, 1, \
- (uint)(lim) >> 28, 0, 0, 1, 1, (uint)(base) >> 24 }
-#define SEG16(type, base, lim, dpl) (struct segdesc) \
-{ (lim) & 0xffff, (uint)(base) & 0xffff, \
- ((uint)(base) >> 16) & 0xff, type, 1, dpl, 1, \
- (uint)(lim) >> 16, 0, 0, 1, 0, (uint)(base) >> 24 }
+// SEGDESC constructs a segment descriptor literal
+// with the given, base, limit, and type bits.
+#define SEGDESC(base, limit, bits) (struct segdesc){ \
+ (limit)&0xffff, (base)&0xffff, \
+ ((base)>>16)&0xff, \
+ (bits)&0xff, \
+ (((bits)>>4)&0xf0) | ((limit>>16)&0xf), \
+ ((base)>>24)&0xff, \
+}
+
+// SEGDESCHI constructs an extension segment descriptor
+// literal that records the high bits of base.
+#define SEGDESCHI(base) (struct segdesc) { \
+ (((base)>>32)&0xffff), (((base)>>48)&0xffff), \
+}
+
#endif
#define DPL_USER 0x3 // User DPL
+#define SEG_A (1<<0) /* segment accessed bit */
+#define SEG_R (1<<1) /* readable (code) */
+#define SEG_W (1<<1) /* writable (data) */
+#define SEG_C (1<<2) /* conforming segment (code) */
+#define SEG_E (1<<2) /* expand-down bit (data) */
+#define SEG_CODE (1<<3) /* code segment (instead of data) */
+
+// User and system segment bits.
+#define SEG_S (1<<4) /* if 0, system descriptor */
+#define SEG_DPL(x) ((x)<<5) /* descriptor privilege level (2 bits) */
+#define SEG_P (1<<7) /* segment present */
+#define SEG_AVL (1<<8) /* available for operating system use */
+#define SEG_L (1<<9) /* long mode */
+#define SEG_D (1<<10) /* default operation size 32-bit */
+#define SEG_G (1<<11) /* granularity */
+
// Application segment type bits
#define STA_X 0x8 // Executable segment
#define STA_W 0x2 // Writeable (non-executable segments)
#define STA_R 0x2 // Readable (executable segments)
// System segment type bits
-#define STS_T32A 0x9 // Available 32-bit TSS
-#define STS_IG32 0xE // 32-bit Interrupt Gate
-#define STS_TG32 0xF // 32-bit Trap Gate
-
-// A virtual address 'la' has a three-part structure as follows:
+#define SEG_LDT (2<<0) /* local descriptor table */
+#define SEG_TSS64A (9<<0) /* available 64-bit TSS */
+#define SEG_TSS64B (11<<0) /* busy 64-bit TSS */
+#define SEG_CALL64 (12<<0) /* 64-bit call gate */
+#define SEG_INTR64 (14<<0) /* 64-bit interrupt gate */
+#define SEG_TRAP64 (15<<0) /* 64-bit trap gate */
+
+// A virtual address 'la' has a six-part structure as follows:
//
-// +--------10------+-------10-------+---------12----------+
-// | Page Directory | Page Table | Offset within Page |
-// | Index | Index | |
-// +----------------+----------------+---------------------+
-// \--- PDX(va) --/ \--- PTX(va) --/
-
+// +--16--+---9---+------9-------+-----9----+----9-------+----12-------+
+// | Sign | PML4 |Page Directory| Page Dir |Page Table | Offset Page |
+// |Extend| Index | Pointer Index| Index | Index | in Page |
+// +------+-------+--------------+----------+------------+-------------+
+// \-PMX(va)-/\-PDPX(va)--/ \-PDX(va)-/ \-PTX(va)-/
+
+#define PMX(va) (((uint64)(va) >> PML4XSHIFT) & PXMASK)
+#define PDPX(va) (((uint64)(va) >> PDPXSHIFT) & PXMASK)
// page directory index
-#define PDX(va) (((uint)(va) >> PDXSHIFT) & 0x3FF)
-
+#define PDX(va) (((uint64)(va) >> PDXSHIFT) & PXMASK)
// page table index
-#define PTX(va) (((uint)(va) >> PTXSHIFT) & 0x3FF)
+#define PTX(va) (((uint64)(va) >> PTXSHIFT) & PXMASK)
// construct virtual address from indexes and offset
-#define PGADDR(d, t, o) ((uint)((d) << PDXSHIFT | (t) << PTXSHIFT | (o)))
+#define PGADDR(d, t, o) ((uint64)((d) << PDXSHIFT | (t) << PTXSHIFT | (o)))
// Page directory and page table constants.
-#define NPDENTRIES 1024 // # directory entries per page directory
-#define NPTENTRIES 1024 // # PTEs per page table
+#define NPDENTRIES 512 // # directory entries per page directory
+#define NPTENTRIES 512 // # PTEs per page table
#define PGSIZE 4096 // bytes mapped by a page
#define PTXSHIFT 12 // offset of PTX in a linear address
-#define PDXSHIFT 22 // offset of PDX in a linear address
+#define PDXSHIFT 21 // offset of PDX in a linear address
+#define PDPXSHIFT 30 // offset of PDPX in a linear address
+#define PML4XSHIFT 39 // offset of PML4X in a linear address
+#define PXMASK 0X1FF
#define PGROUNDUP(sz) (((sz)+PGSIZE-1) & ~(PGSIZE-1))
#define PGROUNDDOWN(a) (((a)) & ~(PGSIZE-1))
@@ -95,87 +120,54 @@ struct segdesc {
#define PTE_W 0x002 // Writeable
#define PTE_U 0x004 // User
#define PTE_PS 0x080 // Page Size
+#define PTE_PWT 0x008 // Write-Through
+#define PTE_PCD 0x010 // Cache-Disable
// Address in page table or page directory entry
-#define PTE_ADDR(pte) ((uint)(pte) & ~0xFFF)
-#define PTE_FLAGS(pte) ((uint)(pte) & 0xFFF)
+#define PTE_ADDR(pte) ((uint64)(pte) & ~0xFFF)
+#define PTE_FLAGS(pte) ((uint64)(pte) & 0xFFF)
#ifndef __ASSEMBLER__
-typedef uint pte_t;
-// Task state segment format
-struct taskstate {
- uint link; // Old ts selector
- uint esp0; // Stack pointers and segment selectors
- ushort ss0; // after an increase in privilege level
- ushort padding1;
- uint *esp1;
- ushort ss1;
- ushort padding2;
- uint *esp2;
- ushort ss2;
- ushort padding3;
- void *cr3; // Page directory base
- uint *eip; // Saved state from last task switch
- uint eflags;
- uint eax; // More saved state (registers)
- uint ecx;
- uint edx;
- uint ebx;
- uint *esp;
- uint *ebp;
- uint esi;
- uint edi;
- ushort es; // Even more saved state (segment selectors)
- ushort padding4;
- ushort cs;
- ushort padding5;
- ushort ss;
- ushort padding6;
- ushort ds;
- ushort padding7;
- ushort fs;
- ushort padding8;
- ushort gs;
- ushort padding9;
- ushort ldt;
- ushort padding10;
- ushort t; // Trap on task switch
- ushort iomb; // I/O map base address
-};
+typedef uint64 pml4e_t;
+typedef uint64 pdpe_t;
+typedef uint64 pte_t;
-// Gate descriptors for interrupts and traps
-struct gatedesc {
- uint off_15_0 : 16; // low 16 bits of offset in segment
- uint cs : 16; // code segment selector
- uint args : 5; // # args, 0 for interrupt/trap gates
- uint rsv1 : 3; // reserved(should be zero I guess)
- uint type : 4; // type(STS_{IG32,TG32})
- uint s : 1; // must be 0 (system)
- uint dpl : 2; // descriptor(meaning new) privilege level
- uint p : 1; // Present
- uint off_31_16 : 16; // high bits of offset in segment
+struct taskstate {
+ uint8 reserved0[4];
+ uint64 rsp[3];
+ uint64 ist[8];
+ uint8 reserved1[10];
+ uint16 iomba;
+ uint8 iopb[0];
+} __attribute__ ((packed));
+
+#define INT_P (1<<7) /* interrupt descriptor present */
+
+struct intgate
+{
+ uint16 rip0;
+ uint16 cs;
+ uint8 reserved0;
+ uint8 bits;
+ uint16 rip1;
+ uint32 rip2;
+ uint32 reserved1;
};
-// Set up a normal interrupt/trap gate descriptor.
-// - istrap: 1 for a trap (= exception) gate, 0 for an interrupt gate.
-// interrupt gate clears FL_IF, trap gate leaves FL_IF alone
-// - sel: Code segment selector for interrupt/trap handler
-// - off: Offset in code segment for interrupt/trap handler
-// - dpl: Descriptor Privilege Level -
-// the privilege level required for software to invoke
-// this interrupt/trap gate explicitly using an int instruction.
-#define SETGATE(gate, istrap, sel, off, d) \
-{ \
- (gate).off_15_0 = (uint)(off) & 0xffff; \
- (gate).cs = (sel); \
- (gate).args = 0; \
- (gate).rsv1 = 0; \
- (gate).type = (istrap) ? STS_TG32 : STS_IG32; \
- (gate).s = 0; \
- (gate).dpl = (d); \
- (gate).p = 1; \
- (gate).off_31_16 = (uint)(off) >> 16; \
+// INTDESC constructs an interrupt descriptor literal
+// that records the given code segment, instruction pointer,
+// and type bits.
+#define INTDESC(cs, rip, bits) (struct intgate){ \
+ (rip)&0xffff, (cs), 0, bits, ((rip)>>16)&0xffff, \
+ (uint64)(rip)>>32, 0, \
}
+// See section 4.6 of amd64 vol2
+struct desctr
+{
+ uint16 limit;
+ uint64 base;
+} __attribute__((packed, aligned(16))); // important!
+
#endif
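
[Editorial note, sketch only.] The PMX/PDPX/PDX/PTX macros above are what let walkpgdir "walk 4 levels" as described in the commit message; the vm.c hunk itself is not shown in this excerpt. A minimal read-only sketch of such a walk, assuming all intermediate tables are present and reachable through P2V (the real code also allocates missing levels), might look like this:

// Sketch only, not the committed walkpgdir: follow a virtual address
// through PML4 -> PDPT -> page directory -> page table and return a
// pointer to the leaf PTE, or 0 if an intermediate entry is not present.
// 2 MB PTE_PS mappings at the page-directory level are not handled.
static pte_t*
walk_sketch(pml4e_t *pml4, const void *va)
{
  pml4e_t pml4e = pml4[PMX(va)];
  if(!(pml4e & PTE_P))
    return 0;
  pdpe_t *pdpt = (pdpe_t*)P2V(PTE_ADDR(pml4e));

  pdpe_t pdpe = pdpt[PDPX(va)];
  if(!(pdpe & PTE_P))
    return 0;
  pde_t *pd = (pde_t*)P2V(PTE_ADDR(pdpe));

  pde_t pde = pd[PDX(va)];
  if(!(pde & PTE_P))
    return 0;
  pte_t *pt = (pte_t*)P2V(PTE_ADDR(pde));

  return &pt[PTX(va)];        // caller checks PTE_P on the leaf entry
}
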
diff --git a/mp.c b/mp.c
index 79bb0ad..e36e45c 100644
--- a/mp.c
+++ b/mp.c
@@ -28,7 +28,7 @@ sum(uchar *addr, int len)
// Look for an MP structure in the len bytes at addr.
static struct mp*
-mpsearch1(uint a, int len)
+mpsearch1(uint64 a, int len)
{
uchar *e, *p, *addr;
@@ -77,7 +77,7 @@ mpconfig(struct mp **pmp)
if((mp = mpsearch()) == 0 || mp->physaddr == 0)
return 0;
- conf = (struct mpconf*) P2V((uint) mp->physaddr);
+ conf = (struct mpconf*) P2V((uint64) mp->physaddr);
if(memcmp(conf, "PCMP", 4) != 0)
return 0;
if(conf->version != 1 && conf->version != 4)
@@ -101,7 +101,7 @@ mpinit(void)
if((conf = mpconfig(&mp)) == 0)
panic("Expect to run on an SMP");
ismp = 1;
- lapic = (uint*)conf->lapicaddr;
+ lapic = P2V((uint64)conf->lapicaddr_p);
for(p=(uchar*)(conf+1), e=(uchar*)conf+conf->length; p<e; ){
switch(*p){
case MPPROC:
diff --git a/mp.h b/mp.h
index 4d17283..5964b63 100644
--- a/mp.h
+++ b/mp.h
@@ -2,7 +2,7 @@
struct mp { // floating pointer
uchar signature[4]; // "_MP_"
- void *physaddr; // phys addr of MP config table
+ uint32 physaddr; // phys addr of MP config table
uchar length; // 1
uchar specrev; // [14]
uchar checksum; // all bytes must add up to 0
@@ -17,10 +17,10 @@ struct mpconf { // configuration table header
uchar version; // [14]
uchar checksum; // all bytes must add up to 0
uchar product[20]; // product id
- uint *oemtable; // OEM table pointer
+ uint32 oemtable; // OEM table pointer
ushort oemlength; // OEM table length
ushort entry; // entry count
- uint *lapicaddr; // address of local APIC
+ uint32 lapicaddr_p; // address of local APIC
ushort xlength; // extended table length
uchar xchecksum; // extended table checksum
uchar reserved;
@@ -42,7 +42,7 @@ struct mpioapic { // I/O APIC table entry
uchar apicno; // I/O APIC id
uchar version; // I/O APIC version
uchar flags; // I/O APIC flags
- uint *addr; // I/O APIC address
+ uint32 addr_p; // I/O APIC address
};
// Table entry types
diff --git a/msr.h b/msr.h
new file mode 100644
index 0000000..ad901a6
--- /dev/null
+++ b/msr.h
@@ -0,0 +1,25 @@
+// SYSCALL and SYSRET registers
+#define MSR_STAR 0xc0000081
+#define MSR_LSTAR 0xc0000082
+#define MSR_CSTAR 0xc0000083
+#define MSR_SFMASK 0xc0000084
+
+// GS
+#define MSR_GS_BASE 0xc0000101
+#define MSR_GS_KERNBASE 0xc0000102
+
+static inline uint64
+readmsr(uint32 msr)
+{
+ uint32 hi, lo;
+ __asm volatile("rdmsr" : "=d" (hi), "=a" (lo) : "c" (msr));
+ return ((uint64) lo) | (((uint64) hi) << 32);
+}
+
+static inline void
+writemsr(uint64 msr, uint64 val)
+{
+ uint32 lo = val & 0xffffffff;
+ uint32 hi = val >> 32;
+ __asm volatile("wrmsr" : : "c" (msr), "a" (lo), "d" (hi) : "memory");
+}
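
[Editorial note, sketch only.] The commit message says vm.c now "programs MSRs for syscalls and core-local state (for swapgs)"; the readmsr/writemsr helpers above are the building blocks for that. The following is a hedged sketch of what such initialization could look like, not the committed vm.c code: the STAR packing follows the KCSEG/UCSEG selector layout in mmu.h, and "sysentry" stands in for the syscall entry label in trapasm.S (both names here are assumptions):

// Sketch only; assumes the kernel headers types.h, mmu.h, msr.h, proc.h.
// Programs the syscall MSRs and the per-core GS base for one CPU.
extern void sysentry(void);            // assumed name of the trapasm.S entry

static void
syscallinit_sketch(struct cpu *c)
{
  // STAR[47:32] gives the kernel CS for syscall (SS is that + 8);
  // STAR[63:48] + 16 / + 8 give the user CS/SS for sysret, hence UCSEG-16.
  writemsr(MSR_STAR, ((uint64)(UCSEG - 16) << 48) | ((uint64)KCSEG << 32));
  writemsr(MSR_LSTAR, (uint64)sysentry); // where syscall enters the kernel
  writemsr(MSR_SFMASK, FL_IF);           // clear IF while still on the user stack
  writemsr(MSR_GS_KERNBASE, (uint64)c);  // what swapgs installs on kernel entry
}
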
diff --git a/printf.c b/printf.c
index b3298aa..c820305 100644
--- a/printf.c
+++ b/printf.c
@@ -2,6 +2,10 @@
#include "stat.h"
#include "user.h"
+#include <stdarg.h>
+
+static char digits[] = "0123456789ABCDEF";
+
static void
putc(int fd, char c)
{
@@ -11,7 +15,6 @@ putc(int fd, char c)
static void
printint(int fd, int xx, int base, int sgn)
{
- static char digits[] = "0123456789ABCDEF";
char buf[16];
int i, neg;
uint x;
@@ -35,16 +38,25 @@ printint(int fd, int xx, int base, int sgn)
putc(fd, buf[i]);
}
+static void
+printptr(int fd, uint64 x) {
+ int i;
+ putc(fd, '0');
+ putc(fd, 'x');
+ for (i = 0; i < (sizeof(uint64) * 2); i++, x <<= 4)
+ putc(fd, digits[x >> (sizeof(uint64) * 8 - 4)]);
+}
+
// Print to the given fd. Only understands %d, %x, %p, %s.
void
printf(int fd, const char *fmt, ...)
{
+ va_list ap;
char *s;
int c, i, state;
- uint *ap;
+ va_start(ap, fmt);
state = 0;
- ap = (uint*)(void*)&fmt + 1;
for(i = 0; fmt[i]; i++){
c = fmt[i] & 0xff;
if(state == 0){
@@ -55,14 +67,13 @@ printf(int fd, const char *fmt, ...)
}
} else if(state == '%'){
if(c == 'd'){
- printint(fd, *ap, 10, 1);
- ap++;
- } else if(c == 'x' || c == 'p'){
- printint(fd, *ap, 16, 0);
- ap++;
+ printint(fd, va_arg(ap, int), 10, 1);
+ } else if(c == 'x') {
+ printint(fd, va_arg(ap, int), 16, 0);
+ } else if(c == 'p') {
+ printptr(fd, va_arg(ap, uint64));
} else if(c == 's'){
- s = (char*)*ap;
- ap++;
+ s = va_arg(ap, char*);
if(s == 0)
s = "(null)";
while(*s != 0){
@@ -70,8 +81,7 @@ printf(int fd, const char *fmt, ...)
s++;
}
} else if(c == 'c'){
- putc(fd, *ap);
- ap++;
+ putc(fd, va_arg(ap, uint));
} else if(c == '%'){
putc(fd, c);
} else {
diff --git a/proc.c b/proc.c
index 806b1b1..58ae948 100644
--- a/proc.c
+++ b/proc.c
@@ -6,6 +6,7 @@
#include "x86.h"
#include "proc.h"
#include "spinlock.h"
+#include "msr.h"
struct {
struct spinlock lock;
@@ -16,7 +17,7 @@ static struct proc *initproc;
int nextpid = 1;
extern void forkret(void);
-extern void trapret(void);
+extern void sysexit(void);
static void wakeup1(void *chan);
@@ -104,13 +105,13 @@ found:
// Set up new context to start executing at forkret,
// which returns to trapret.
- sp -= 4;
- *(uint*)sp = (uint)trapret;
+ sp -= sizeof(uint64);
+ *(uint64*)sp = (uint64)sysexit;
sp -= sizeof *p->context;
p->context = (struct context*)sp;
memset(p->context, 0, sizeof *p->context);
- p->context->eip = (uint)forkret;
+ p->context->eip = (uint64)forkret;
return p;
}
@@ -128,16 +129,12 @@ userinit(void)
initproc = p;
if((p->pgdir = setupkvm()) == 0)
panic("userinit: out of memory?");
- inituvm(p->pgdir, _binary_initcode_start, (int)_binary_initcode_size);
+ inituvm(p->pgdir, _binary_initcode_start, (uint64)_binary_initcode_size);
p->sz = PGSIZE;
memset(p->tf, 0, sizeof(*p->tf));
- p->tf->cs = (SEG_UCODE << 3) | DPL_USER;
- p->tf->ds = (SEG_UDATA << 3) | DPL_USER;
- p->tf->es = p->tf->ds;
- p->tf->ss = p->tf->ds;
- p->tf->eflags = FL_IF;
- p->tf->esp = PGSIZE;
- p->tf->eip = 0; // beginning of initcode.S
+ p->tf->r11 = FL_IF;
+ p->tf->rsp = PGSIZE;
+ p->tf->rcx = 0; // beginning of initcode.S
safestrcpy(p->name, "initcode", sizeof(p->name));
p->cwd = namei("/");
@@ -201,7 +198,7 @@ fork(void)
*np->tf = *curproc->tf;
// Clear %eax so that fork returns 0 in the child.
- np->tf->eax = 0;
+ np->tf->rax = 0;
for(i = 0; i < NOFILE; i++)
if(curproc->ofile[i])
@@ -289,8 +286,8 @@ wait(void)
pid = p->pid;
kfree(p->kstack);
p->kstack = 0;
- freevm(p->pgdir);
- p->pid = 0;
+ freevm(p->pgdir, p->sz);
+ p->pid = 0;
p->parent = 0;
p->name[0] = 0;
p->killed = 0;
@@ -339,6 +336,7 @@ scheduler(void)
// Switch to chosen process. It is the process's job
// to release ptable.lock and then reacquire it
// before jumping back to us.
+
c->proc = p;
switchuvm(p);
p->state = RUNNING;
@@ -408,7 +406,7 @@ forkret(void)
iinit(ROOTDEV);
initlog(ROOTDEV);
}
-
+
// Return to "caller", actually trapret (see allocproc).
}
@@ -514,7 +512,7 @@ procdump(void)
int i;
struct proc *p;
char *state;
- uint pc[10];
+ uint64 pc[10];
for(p = ptable.proc; p < &ptable.proc[NPROC]; p++){
if(p->state == UNUSED)
@@ -525,7 +523,7 @@ procdump(void)
state = "???";
cprintf("%d %s %s", p->pid, state, p->name);
if(p->state == SLEEPING){
- getcallerpcs((uint*)p->context->ebp+2, pc);
+ getcallerpcs((uint64*)p->context->ebp+2, pc);
for(i=0; i<10 && pc[i] != 0; i++)
cprintf(" %p", pc[i]);
}
diff --git a/proc.h b/proc.h
index 1647114..5ab2de5 100644
--- a/proc.h
+++ b/proc.h
@@ -1,5 +1,8 @@
// Per-CPU state
struct cpu {
+ uint64 syscallno; // Temporary used by sysentry
+ uint64 usp; // Temporary used by sysentry
+ struct proc *proc; // The process running on this cpu or null
uchar apicid; // Local APIC ID
struct context *scheduler; // swtch() here to enter scheduler
struct taskstate ts; // Used by x86 to find stack for interrupt
@@ -7,7 +10,6 @@ struct cpu {
volatile uint started; // Has the CPU started?
int ncli; // Depth of pushcli nesting.
int intena; // Were interrupts enabled before pushcli?
- struct proc *proc; // The process running on this cpu or null
};
extern struct cpu cpus[NCPU];
@@ -25,20 +27,23 @@ extern int ncpu;
// at the "Switch stacks" comment. Switch doesn't save eip explicitly,
// but it is on the stack and allocproc() manipulates it.
struct context {
- uint edi;
- uint esi;
- uint ebx;
- uint ebp;
- uint eip;
+ uint64 r15;
+ uint64 r14;
+ uint64 r13;
+ uint64 r12;
+ uint64 r11;
+ uint64 rbx;
+ uint64 ebp; //rbp
+ uint64 eip; //rip;
};
enum procstate { UNUSED, EMBRYO, SLEEPING, RUNNABLE, RUNNING, ZOMBIE };
// Per-process state
struct proc {
- uint sz; // Size of process memory (bytes)
+ char *kstack; // Bottom of kernel stack for this process, must be first entry
+ uint64 sz; // Size of process memory (bytes)
pde_t* pgdir; // Page table
- char *kstack; // Bottom of kernel stack for this process
enum procstate state; // Process state
int pid; // Process ID
struct proc *parent; // Parent process
diff --git a/spinlock.c b/spinlock.c
index 4020186..9ee65f6 100644
--- a/spinlock.c
+++ b/spinlock.c
@@ -69,17 +69,17 @@ release(struct spinlock *lk)
// Record the current call stack in pcs[] by following the %ebp chain.
void
-getcallerpcs(void *v, uint pcs[])
+getcallerpcs(void *v, uint64 pcs[])
{
- uint *ebp;
+ uint64 *ebp;
int i;
- ebp = (uint*)v - 2;
+ asm volatile("mov %%rbp, %0" : "=r" (ebp));
for(i = 0; i < 10; i++){
- if(ebp == 0 || ebp < (uint*)KERNBASE || ebp == (uint*)0xffffffff)
+ if(ebp == 0 || ebp < (uint64*)KERNBASE || ebp == (uint64*)0xffffffff)
break;
pcs[i] = ebp[1]; // saved %eip
- ebp = (uint*)ebp[0]; // saved %ebp
+ ebp = (uint64*)ebp[0]; // saved %ebp
}
for(; i < 10; i++)
pcs[i] = 0;
diff --git a/spinlock.h b/spinlock.h
index 0a9d8e2..90bffdb 100644
--- a/spinlock.h
+++ b/spinlock.h
@@ -5,7 +5,7 @@ struct spinlock {
// For debugging:
char *name; // Name of lock.
struct cpu *cpu; // The cpu holding the lock.
- uint pcs[10]; // The call stack (an array of program counters)
+ uint64 pcs[10]; // The call stack (an array of program counters)
// that locked the lock.
};
diff --git a/string.c b/string.c
index a7cc61f..861ea25 100644
--- a/string.c
+++ b/string.c
@@ -4,7 +4,7 @@
void*
memset(void *dst, int c, uint n)
{
- if ((int)dst%4 == 0 && n%4 == 0){
+ if ((uint64)dst%4 == 0 && n%4 == 0){
c &= 0xFF;
stosl(dst, (c<<24)|(c<<16)|(c<<8)|c, n/4);
} else
diff --git a/swtch.S b/swtch.S
index 63a7dcc..de2e79d 100644
--- a/swtch.S
+++ b/swtch.S
@@ -8,22 +8,28 @@
.globl swtch
swtch:
- movl 4(%esp), %eax
- movl 8(%esp), %edx
-
- # Save old callee-saved registers
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
+ # Save old callee-saved registers
+ push %rbp
+ push %rbx
+ push %r11
+ push %r12
+ push %r13
+ push %r14
+ push %r15
# Switch stacks
- movl %esp, (%eax)
- movl %edx, %esp
+ mov %rsp, (%rdi) # first arg is in rdi
+ mov %rsi, %rsp # second arg is in rsi
+
+ # Load new callee-saved registers
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %r11
+ pop %rbx
+ pop %rbp
- # Load new callee-saved registers
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
ret
+
+
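swtch now takes its two arguments in %rdi and %rsi per the x86-64 System V calling convention, and its push order mirrors struct context in proc.h: %r15, pushed last, lands at the lowest address and so lines up with the first field. A sketch of the C-side view, using the prototype xv6 declares in defs.h:

    // swtch saves the callee-saved registers on the current stack, stores the
    // resulting stack pointer through *old (arg 1, %rdi), switches %rsp to new
    // (arg 2, %rsi), and pops the state that the new context pushed when it
    // last called swtch.
    void swtch(struct context **old, struct context *new);

    // Typical call sites (as in proc.c):
    //   scheduler():    swtch(&(c->scheduler), p->context);
    //   sched():        swtch(&p->context, mycpu()->scheduler);
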
diff --git a/syscall.c b/syscall.c
index ee85261..3ffe3d8 100644
--- a/syscall.c
+++ b/syscall.c
@@ -15,13 +15,13 @@
// Fetch the int at addr from the current process.
int
-fetchint(uint addr, int *ip)
+fetchint(uint64 addr, int *ip)
{
struct proc *curproc = myproc();
if(addr >= curproc->sz || addr+4 > curproc->sz)
return -1;
- *ip = *(int*)(addr);
+ *ip = *(uint64*)(addr);
return 0;
}
@@ -29,7 +29,7 @@ fetchint(uint addr, int *ip)
// Doesn't actually copy the string - just sets *pp to point at it.
// Returns length of string, not including nul.
int
-fetchstr(uint addr, char **pp)
+fetchstr(uint64 addr, char **pp)
{
char *s, *ep;
struct proc *curproc = myproc();
@@ -45,11 +45,51 @@ fetchstr(uint addr, char **pp)
return -1;
}
+static uint64
+fetcharg(int n)
+{
+ struct proc *curproc = myproc();
+ switch (n) {
+ case 0:
+ return curproc->tf->rdi;
+ case 1:
+ return curproc->tf->rsi;
+ case 2:
+ return curproc->tf->rdx;
+ case 3:
+ return curproc->tf->r10;
+ case 4:
+ return curproc->tf->r8;
+ case 5:
+ return curproc->tf->r9;
+ }
+ panic("fetcharg");
+ return -1;
+}
+
+int
+fetchaddr(uint64 addr, uint64 *ip)
+{
+ struct proc *curproc = myproc();
+ if(addr >= curproc->sz || addr+sizeof(uint64) > curproc->sz)
+ return -1;
+ *ip = *(uint64*)(addr);
+ return 0;
+}
+
// Fetch the nth 32-bit system call argument.
int
argint(int n, int *ip)
{
- return fetchint((myproc()->tf->esp) + 4 + 4*n, ip);
+ *ip = fetcharg(n);
+ return 0;
+}
+
+int
+argaddr(int n, uint64 *ip)
+{
+ *ip = fetcharg(n);
+ return 0;
}
// Fetch the nth word-sized system call argument as a pointer
@@ -58,10 +98,10 @@ argint(int n, int *ip)
int
argptr(int n, char **pp, int size)
{
- int i;
+ uint64 i;
struct proc *curproc = myproc();
- if(argint(n, &i) < 0)
+ if(argaddr(n, &i) < 0)
return -1;
if(size < 0 || (uint)i >= curproc->sz || (uint)i+size > curproc->sz)
return -1;
@@ -134,12 +174,12 @@ syscall(void)
int num;
struct proc *curproc = myproc();
- num = curproc->tf->eax;
+ num = curproc->tf->rax;
if(num > 0 && num < NELEM(syscalls) && syscalls[num]) {
- curproc->tf->eax = syscalls[num]();
+ curproc->tf->rax = syscalls[num]();
} else {
cprintf("%d %s: unknown sys call %d\n",
curproc->pid, curproc->name, num);
- curproc->tf->eax = -1;
+ curproc->tf->rax = -1;
}
}
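With arguments delivered in registers rather than on the user stack, a handler still reads them positionally through the helpers above. A hypothetical handler (sys_example is not a real xv6 system call) showing the intended use of argint, argaddr, and argptr:

    int
    sys_example(void)
    {
      int n;
      uint64 uaddr;
      char *buf;

      if(argint(0, &n) < 0)        // argument 0 arrives in %rdi
        return -1;
      if(argaddr(1, &uaddr) < 0)   // argument 1 arrives in %rsi
        return -1;
      if(argptr(2, &buf, n) < 0)   // argument 2 arrives in %rdx; argptr bounds-checks it
        return -1;
      // ... use n, uaddr, and buf ...
      return 0;
    }
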
diff --git a/sysfile.c b/sysfile.c
index 87e508b..d0de779 100644
--- a/sysfile.c
+++ b/sysfile.c
@@ -399,16 +399,16 @@ sys_exec(void)
{
char *path, *argv[MAXARG];
int i;
- uint uargv, uarg;
+ uint64 uargv, uarg;
- if(argstr(0, &path) < 0 || argint(1, (int*)&uargv) < 0){
+ if(argstr(0, &path) < 0 || argaddr(1, &uargv) < 0){
return -1;
}
memset(argv, 0, sizeof(argv));
for(i=0;; i++){
if(i >= NELEM(argv))
return -1;
- if(fetchint(uargv+4*i, (int*)&uarg) < 0)
+ if(fetchaddr(uargv+sizeof(uint64)*i, (uint64*)&uarg) < 0)
return -1;
if(uarg == 0){
argv[i] = 0;
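The argv that sys_exec walks is now an array of 8-byte user pointers terminated by a zero slot, which is why the loop advances by sizeof(uint64) and reads each slot with fetchaddr before handing it to fetchstr. A user-side sketch (illustration only; this program is made up) of the layout the loop expects:

    #include "types.h"
    #include "stat.h"
    #include "user.h"

    int
    main(void)
    {
      char *argv[] = { "echo", "hello", 0 };  // each slot is 8 bytes on x86-64
      exec("echo", argv);                     // kernel reads slot i with fetchaddr(uargv + 8*i, ...)
      printf(2, "exec failed\n");
      exit();
    }
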
diff --git a/trap.c b/trap.c
index 41c66eb..f27b99b 100644
--- a/trap.c
+++ b/trap.c
@@ -9,8 +9,8 @@
#include "spinlock.h"
// Interrupt descriptor table (shared by all CPUs).
-struct gatedesc idt[256];
-extern uint vectors[]; // in vectors.S: array of 256 entry pointers
+struct intgate idt[256];
+extern uint64 vectors[]; // in vectors.S: array of 256 entry pointers
struct spinlock tickslock;
uint ticks;
@@ -19,17 +19,22 @@ tvinit(void)
{
int i;
- for(i = 0; i < 256; i++)
- SETGATE(idt[i], 0, SEG_KCODE<<3, vectors[i], 0);
- SETGATE(idt[T_SYSCALL], 1, SEG_KCODE<<3, vectors[T_SYSCALL], DPL_USER);
-
+ for(i=0; i<256; i++) {
+ idt[i] = INTDESC(KCSEG, vectors[i], INT_P | SEG_INTR64);
+ }
+ idtinit();
+
initlock(&tickslock, "time");
}
void
idtinit(void)
{
- lidt(idt, sizeof(idt));
+ struct desctr dtr;
+
+ dtr.limit = sizeof(idt) - 1;
+ dtr.base = (uint64)idt;
+ lidt((void *)&dtr.limit);
}
//PAGEBREAK: 41
@@ -74,7 +79,7 @@ trap(struct trapframe *tf)
case T_IRQ0 + 7:
case T_IRQ0 + IRQ_SPURIOUS:
cprintf("cpu%d: spurious interrupt at %x:%x\n",
- cpuid(), tf->cs, tf->eip);
+ cpuid(), tf->cs, tf->rip);
lapiceoi();
break;
@@ -83,14 +88,14 @@ trap(struct trapframe *tf)
if(myproc() == 0 || (tf->cs&3) == 0){
// In kernel, it must be our mistake.
cprintf("unexpected trap %d from cpu %d eip %x (cr2=0x%x)\n",
- tf->trapno, cpuid(), tf->eip, rcr2());
+ tf->trapno, cpuid(), tf->rip, rcr2());
panic("trap");
}
// In user space, assume process misbehaved.
cprintf("pid %d %s: trap %d err %d on cpu %d "
"eip 0x%x addr 0x%x--kill proc\n",
myproc()->pid, myproc()->name, tf->trapno,
- tf->err, cpuid(), tf->eip, rcr2());
+ tf->err, cpuid(), tf->rip, rcr2());
myproc()->killed = 1;
}
@@ -105,8 +110,10 @@ trap(struct trapframe *tf)
if(myproc() && myproc()->state == RUNNING &&
tf->trapno == T_IRQ0+IRQ_TIMER)
yield();
-
+
// Check if the process has been killed since we yielded
if(myproc() && myproc()->killed && (tf->cs&3) == DPL_USER)
exit();
}
+
+
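idtinit above (and lgdt in vm.c) now hands the CPU a pointer to the 2-byte limit of a descriptor-table pseudo-descriptor. The struct desctr definition itself is not in this diff; a sketch of what it is assumed to look like in mmu.h, where packed keeps the 8-byte base immediately after the 16-bit limit, the 10-byte format lidt/lgdt expect:

    struct desctr {
      uint16 limit;   // size of the table in bytes, minus 1
      uint64 base;    // linear address of the first descriptor
    } __attribute__((packed, aligned(16)));

    // Usage, as in idtinit():
    //   struct desctr dtr = { sizeof(idt) - 1, (uint64)idt };
    //   lidt((void *)&dtr.limit);
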
diff --git a/trapasm.S b/trapasm.S
index da8aefc..b6dbb1a 100644
--- a/trapasm.S
+++ b/trapasm.S
@@ -1,32 +1,136 @@
+#include "param.h"
+#include "x86.h"
#include "mmu.h"
-
- # vectors.S sends all traps here.
+
+# vectors.S sends all traps here.
.globl alltraps
alltraps:
# Build trap frame.
- pushl %ds
- pushl %es
- pushl %fs
- pushl %gs
- pushal
-
- # Set up data segments.
- movw $(SEG_KDATA<<3), %ax
- movw %ax, %ds
- movw %ax, %es
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+ push %r11
+ push %r10
+ push %r9
+ push %r8
+ push %rdi
+ push %rsi
+ push %rbp
+ push %rdx
+ push %rcx
+ push %rbx
+ push %rax
- # Call trap(tf), where tf=%esp
- pushl %esp
+ cmpw $KCSEG, TF_CS(%rsp) # compare to saved cs
+ jz 1f
+ swapgs
+
+1:mov %rsp, %rdi # frame in arg1
call trap
- addl $4, %esp
- # Return falls through to trapret...
+# Return falls through to trapret...
.globl trapret
trapret:
- popal
- popl %gs
- popl %fs
- popl %es
- popl %ds
- addl $0x8, %esp # trapno and errcode
- iret
+ cli
+ cmpw $KCSEG, TF_CS(%rsp) # compare to saved cs
+ jz 1f
+ swapgs
+
+1:pop %rax
+ pop %rbx
+ pop %rcx
+ pop %rdx
+ pop %rbp
+ pop %rsi
+ pop %rdi
+ pop %r8
+ pop %r9
+ pop %r10
+ pop %r11
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+
+ add $16, %rsp # discard trapnum and errorcode
+ iretq
+#PAGEBREAK!
+
+# The syscall instruction vectors here via MSR_LSTAR (see seginit in vm.c)
+.globl sysentry
+sysentry: # Build trap frame.
+ // load kernel stack address
+ swapgs
+ movq %rax, %gs:0 // save %rax in syscallno of cpu entry
+ movq %rsp, %gs:8 // user sp
+ movq %gs:16, %rax // proc entry
+
+ movq %ss:0(%rax), %rax // load kstack from proc
+ addq $(KSTACKSIZE), %rax
+
+ movq %rax, %rsp
+ movq %gs:0, %rax // restore rax
+
+ // push a dummy ss and the saved user sp
+ push $0
+ push %gs:8
+ // save user rflags and rip
+ push %r11
+ push $UCSEG
+ push %rcx
+ // push err code and trapno to make the stack look like a trap frame
+ push $0
+ push $64
+
+ // push values on kernel stack
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+ push %r11
+ push %r10
+ push %r9
+ push %r8
+ push %rdi
+ push %rsi
+ push %rbp
+ push %rdx
+ push %rcx
+ push %rbx
+ push %rax
+
+ mov %rsp, %rdi # frame in arg1
+
+ call trap
+#PAGEBREAK!
+
+# Return falls through to trapret...
+.globl sysexit
+sysexit:
+ # to make sure we don't get any interrupts on the user stack while in
+ # supervisor mode. Insufficient? (see vulnerability reports for sysret)
+ cli
+
+ pop %rax
+ pop %rbx
+ pop %rcx
+ pop %rdx
+ pop %rbp
+ pop %rsi
+ pop %rdi
+ pop %r8
+ pop %r9
+ pop %r10
+ pop %r11
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+
+ add $(5*8), %rsp # discard trapnum, errorcode, rip, cs and rflags
+ mov (%rsp),%rsp # switch to the user stack
+ swapgs
+
+ sysretq
+
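sysentry fabricates, by hand, the same frame that the CPU plus alltraps build for an interrupt, so trap() and the shared register layout can be reused. A C rendering of just the non-register part of that frame, as a sketch only; it assumes struct trapframe from x86.h, UCSEG from mmu.h, uint64 from types.h, and that the 64 pushed above is T_SYSCALL:

    void
    fake_syscall_frame(struct trapframe *tf, uint64 user_rsp, uint64 user_rip,
                       uint64 user_rflags)
    {
      tf->ss     = 0;            // push $0      (ss slot; sysretq never reads it)
      tf->rsp    = user_rsp;     // push %gs:8   (user stack pointer saved on entry)
      tf->rflags = user_rflags;  // push %r11    (syscall left user rflags in %r11)
      tf->cs     = UCSEG;        // push $UCSEG
      tf->rip    = user_rip;     // push %rcx    (syscall left the return rip in %rcx)
      tf->err    = 0;            // push $0
      tf->trapno = 64;           // push $64     (assumed to be T_SYSCALL)
      // ...followed by the fifteen general-purpose registers, %rax last,
      // in exactly the order alltraps pushes them.
    }
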
diff --git a/traps.h b/traps.h
index 0bd1fd8..cb6f8a7 100644
--- a/traps.h
+++ b/traps.h
@@ -36,3 +36,4 @@
#define IRQ_ERROR 19
#define IRQ_SPURIOUS 31
+
diff --git a/types.h b/types.h
index e4adf64..ee73164 100644
--- a/types.h
+++ b/types.h
@@ -1,4 +1,10 @@
typedef unsigned int uint;
typedef unsigned short ushort;
typedef unsigned char uchar;
-typedef uint pde_t;
+
+typedef unsigned char uint8;
+typedef unsigned short uint16;
+typedef unsigned int uint32;
+typedef unsigned long uint64;
+
+typedef uint64 pde_t;
diff --git a/usertests.c b/usertests.c
index a1e97e7..07d10d4 100644
--- a/usertests.c
+++ b/usertests.c
@@ -363,17 +363,29 @@ preempt(void)
printf(1, "preempt: ");
pid1 = fork();
+ if(pid1 < 0) {
+ printf(1, "fork failed");
+ exit();
+ }
if(pid1 == 0)
for(;;)
;
pid2 = fork();
+ if(pid2 < 0) {
+ printf(1, "fork failed\n");
+ exit();
+ }
if(pid2 == 0)
for(;;)
;
pipe(pfds);
pid3 = fork();
+ if(pid3 < 0) {
+ printf(1, "fork failed\n");
+ exit();
+ }
if(pid3 == 0){
close(pfds[0]);
if(write(pfds[1], "x", 1) != 1)
@@ -1391,6 +1403,11 @@ forktest(void)
exit();
}
+ if (n == 0) {
+ printf(1, "no fork at all!\n");
+ exit();
+ }
+
if(n == 1000){
printf(1, "fork claimed to work 1000 times!\n");
exit();
@@ -1414,16 +1431,16 @@ forktest(void)
void
sbrktest(void)
{
- int fds[2], pid, pids[10], ppid;
- char *a, *b, *c, *lastaddr, *oldbrk, *p, scratch;
- uint amt;
+ int i, fds[2], pids[10], pid, ppid;
+ char *c, *oldbrk, scratch, *a, *b, *lastaddr, *p;
+ uint64 amt;
+ #define BIG (100*1024*1024)
printf(stdout, "sbrk test\n");
oldbrk = sbrk(0);
// can one sbrk() less than a page?
a = sbrk(0);
- int i;
for(i = 0; i < 5000; i++){
b = sbrk(1);
if(b != a){
@@ -1449,9 +1466,8 @@ sbrktest(void)
wait();
// can one grow address space to something big?
-#define BIG (100*1024*1024)
a = sbrk(0);
- amt = (BIG) - (uint)a;
+ amt = (BIG) - (uint64)a;
p = sbrk(amt);
if (p != a) {
printf(stdout, "sbrk test failed to grow big address space; enough phys mem?\n");
@@ -1508,7 +1524,7 @@ sbrktest(void)
}
wait();
}
-
+
// if we run the system out of memory, does it clean up the last
// failed allocation?
if(pipe(fds) != 0){
@@ -1518,7 +1534,7 @@ sbrktest(void)
for(i = 0; i < sizeof(pids)/sizeof(pids[0]); i++){
if((pids[i] = fork()) == 0){
// allocate a lot of memory
- sbrk(BIG - (uint)sbrk(0));
+ sbrk(BIG - (uint64)sbrk(0));
write(fds[1], "x", 1);
// sit around until killed
for(;;) sleep(1000);
@@ -1526,6 +1542,7 @@ sbrktest(void)
if(pids[i] != -1)
read(fds[0], &scratch, 1);
}
+
// if those failed allocations freed up the pages they did allocate,
// we'll be able to allocate here
c = sbrk(4096);
@@ -1549,7 +1566,7 @@ sbrktest(void)
void
validateint(int *p)
{
- int res;
+ /* XXX int res;
asm("mov %%esp, %%ebx\n\t"
"mov %3, %%esp\n\t"
"int %2\n\t"
@@ -1557,13 +1574,14 @@ validateint(int *p)
"=a" (res) :
"a" (SYS_sleep), "n" (T_SYSCALL), "c" (p) :
"ebx");
+ */
}
void
validatetest(void)
{
int hi, pid;
- uint p;
+ uint64 p;
printf(stdout, "validate test\n");
hi = 1100*1024;
diff --git a/usys.S b/usys.S
index 8bfd8a1..e62f3d9 100644
--- a/usys.S
+++ b/usys.S
@@ -5,7 +5,7 @@
.globl name; \
name: \
movl $SYS_ ## name, %eax; \
- int $T_SYSCALL; \
+ syscall; \
ret
SYSCALL(fork)
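Because the kernel now reads system-call arguments out of the saved registers, the SYSCALL stubs only have to load %eax and execute syscall: the first three C arguments are already in %rdi, %rsi, and %rdx, exactly where fetcharg looks. A C sketch of what the write stub amounts to (assuming SYS_write from syscall.h); syscall itself clobbers %rcx and %r11, which is why trapasm.S treats them as the saved rip and rflags:

    static inline long
    write_stub(int fd, const void *buf, int n)
    {
      register long a0  asm("rdi") = fd;
      register long a1  asm("rsi") = (long)buf;
      register long a2  asm("rdx") = n;
      register long num asm("rax") = SYS_write;   // system call number

      asm volatile("syscall"
                   : "+r"(num)                    // return value comes back in %rax
                   : "r"(a0), "r"(a1), "r"(a2)
                   : "rcx", "r11", "memory");     // clobbered by the syscall instruction
      return num;
    }
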
diff --git a/vectors.pl b/vectors.pl
index 57b49dd..d746d6b 100755
--- a/vectors.pl
+++ b/vectors.pl
@@ -12,9 +12,9 @@ for(my $i = 0; $i < 256; $i++){
print ".globl vector$i\n";
print "vector$i:\n";
if(!($i == 8 || ($i >= 10 && $i <= 14) || $i == 17)){
- print " pushl \$0\n";
+ print " push \$0\n";
}
- print " pushl \$$i\n";
+ print " push \$$i\n";
print " jmp alltraps\n";
}
@@ -23,7 +23,7 @@ print ".data\n";
print ".globl vectors\n";
print "vectors:\n";
for(my $i = 0; $i < 256; $i++){
- print " .long vector$i\n";
+ print " .quad vector$i\n";
}
# sample output:
@@ -31,8 +31,8 @@ for(my $i = 0; $i < 256; $i++){
# .globl alltraps
# .globl vector0
# vector0:
-# pushl $0
-# pushl $0
+# push $0
+# push $0
# jmp alltraps
# ...
#
@@ -40,8 +40,8 @@ for(my $i = 0; $i < 256; $i++){
# .data
# .globl vectors
# vectors:
-# .long vector0
-# .long vector1
-# .long vector2
+# .quad vector0
+# .quad vector1
+# .quad vector2
# ...
diff --git a/vm.c b/vm.c
index 7134cff..fb0cc33 100644
--- a/vm.c
+++ b/vm.c
@@ -2,13 +2,34 @@
#include "types.h"
#include "defs.h"
#include "x86.h"
+#include "msr.h"
#include "memlayout.h"
#include "mmu.h"
#include "proc.h"
#include "elf.h"
+#include "traps.h"
extern char data[]; // defined by kernel.ld
-pde_t *kpgdir; // for use in scheduler()
+void sysentry(void);
+
+static pde_t *kpml4; // kernel address space, used by scheduler and bootup
+
+// Bootstrap GDT. Used by boot.S but defined in C
+// Map "logical" addresses to virtual addresses using identity map.
+// Cannot share a CODE descriptor for both kernel and user
+// because it would have to have DPL_USR, but the CPU forbids
+// an interrupt from CPL=0 to DPL=3.
+struct segdesc bootgdt[NSEGS] = {
+ [0] = SEGDESC(0, 0, 0), // null
+ [1] = SEGDESC(0, 0xfffff, SEG_R|SEG_CODE|SEG_S|SEG_DPL(0)|SEG_P|SEG_D|SEG_G), // 32-bit kernel code
+ [2] = SEGDESC(0, 0, SEG_R|SEG_CODE|SEG_S|SEG_DPL(0)|SEG_P|SEG_L|SEG_G), // 64-bit kernel code
+ [3] = SEGDESC(0, 0xfffff, SEG_W|SEG_S|SEG_DPL(0)|SEG_P|SEG_D|SEG_G), // kernel data
+ // The order of the user data and user code segments is
+ // important for the syscall/sysret instructions. See seginit().
+ [6] = SEGDESC(0, 0xfffff, SEG_W|SEG_S|SEG_DPL(3)|SEG_P|SEG_D|SEG_G), // 64-bit user data
+ [7] = SEGDESC(0, 0, SEG_R|SEG_CODE|SEG_S|SEG_DPL(3)|SEG_P|SEG_L|SEG_G), // 64-bit user code
+};
+
// Set up CPU's kernel segment descriptors.
// Run once on entry on each CPU.
@@ -16,41 +37,82 @@ void
seginit(void)
{
struct cpu *c;
-
- // Map "logical" addresses to virtual addresses using identity map.
- // Cannot share a CODE descriptor for both kernel and user
- // because it would have to have DPL_USR, but the CPU forbids
- // an interrupt from CPL=0 to DPL=3.
- c = &cpus[cpuid()];
- c->gdt[SEG_KCODE] = SEG(STA_X|STA_R, 0, 0xffffffff, 0);
- c->gdt[SEG_KDATA] = SEG(STA_W, 0, 0xffffffff, 0);
- c->gdt[SEG_UCODE] = SEG(STA_X|STA_R, 0, 0xffffffff, DPL_USER);
- c->gdt[SEG_UDATA] = SEG(STA_W, 0, 0xffffffff, DPL_USER);
- lgdt(c->gdt, sizeof(c->gdt));
+ struct desctr dtr;
+
+ c = mycpu();
+ memmove(c->gdt, bootgdt, sizeof bootgdt);
+ dtr.limit = sizeof(c->gdt)-1;
+ dtr.base = (uint64) c->gdt;
+ lgdt((void *)&dtr.limit);
+
+ // When executing a syscall instruction the CPU sets the SS selector
+ // to (star >> 32) + 8 and the CS selector to (star >> 32).
+ // When executing a sysret instruction the CPU sets the SS selector
+ // to (star >> 48) + 8 and the CS selector to (star >> 48) + 16.
+ uint64 star = ((((uint64)UCSEG|0x3)- 16)<<48)|((uint64)(KCSEG)<<32);
+ writemsr(MSR_STAR, star);
+ writemsr(MSR_LSTAR, (uint64)&sysentry);
+ writemsr(MSR_SFMASK, FL_TF | FL_IF);
+
+ // Initialize cpu-local storage.
+ writegs(KDSEG);
+ writemsr(MSR_GS_BASE, (uint64)c);
+ writemsr(MSR_GS_KERNBASE, (uint64)c);
}
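The STAR value above encodes which selectors syscall and sysret load, given the selectors implied by bootgdt (KCSEG=0x10, KDSEG=0x18, UDSEG=0x30, UCSEG=0x38, i.e. gdt indices 2, 3, 6, 7 times 8; the exact constants are assumed to live in mmu.h). A small host-side arithmetic check:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
      enum { KCSEG = 2<<3, KDSEG = 3<<3, UDSEG = 6<<3, UCSEG = 7<<3 };
      uint64_t star = ((((uint64_t)UCSEG|0x3) - 16) << 48) | ((uint64_t)KCSEG << 32);

      // syscall: CS = STAR[47:32], SS = STAR[47:32] + 8
      printf("syscall CS=%#llx SS=%#llx\n",
             (unsigned long long)((star >> 32) & 0xffff),          // 0x10 = KCSEG
             (unsigned long long)(((star >> 32) & 0xffff) + 8));   // 0x18 = KDSEG
      // 64-bit sysret: CS = STAR[63:48] + 16, SS = STAR[63:48] + 8
      printf("sysret  CS=%#llx SS=%#llx\n",
             (unsigned long long)((star >> 48) + 16),              // 0x3b = UCSEG|3
             (unsigned long long)((star >> 48) + 8));              // 0x33 = UDSEG|3
      return 0;
    }

This is also why the user data entry ([6]) must sit directly below the user code entry ([7]) in bootgdt: sysret derives both SS and CS from the single base selector in STAR[63:48].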
// Return the address of the PTE in page table pgdir
// that corresponds to virtual address va. If alloc!=0,
// create any required page table pages.
static pte_t *
-walkpgdir(pde_t *pgdir, const void *va, int alloc)
+walkpgdir(pde_t *pml4, const void *va, int alloc)
{
+ pml4e_t *pml4e;
+ pdpe_t *pdp;
+ pdpe_t *pdpe;
pde_t *pde;
+ pde_t *pd;
pte_t *pgtab;
- pde = &pgdir[PDX(va)];
- if(*pde & PTE_P){
- pgtab = (pte_t*)P2V(PTE_ADDR(*pde));
- } else {
- if(!alloc || (pgtab = (pte_t*)kalloc()) == 0)
+ // level 4
+ pml4e = &pml4[PMX(va)];
+ if(*pml4e & PTE_P)
+ pdp = (pdpe_t*)P2V(PTE_ADDR(*pml4e));
+ else {
+ if(!alloc || (pdp = (pdpe_t*)kalloc()) == 0)
return 0;
// Make sure all those PTE_P bits are zero.
- memset(pgtab, 0, PGSIZE);
+ memset(pdp, 0, PGSIZE);
// The permissions here are overly generous, but they can
// be further restricted by the permissions in the page table
// entries, if necessary.
+ *pml4e = V2P(pdp) | PTE_P | PTE_W | PTE_U;
+ }
+
+ // XXX avoid repetition
+
+ // level 3
+ pdpe = &pdp[PDPX(va)];
+ if(*pdpe & PTE_P)
+ pd = (pde_t*)P2V(PTE_ADDR(*pdpe));
+ else {
+ if(!alloc || (pd = (pde_t*)kalloc()) == 0)
+ return 0;
+ memset(pd, 0, PGSIZE);
+ *pdpe = V2P(pd) | PTE_P | PTE_W | PTE_U;
+ }
+
+ // level 2
+ pde = &pd[PDX(va)];
+ if(*pde & PTE_P)
+ pgtab = (pte_t*)P2V(PTE_ADDR(*pde));
+ else {
+ if(!alloc || (pgtab = (pte_t*)kalloc()) == 0)
+ return 0;
+ memset(pgtab, 0, PGSIZE);
*pde = V2P(pgtab) | PTE_P | PTE_W | PTE_U;
}
+
+ // level 1
return &pgtab[PTX(va)];
}
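walkpgdir now walks four levels, indexing each table with 9 bits of the virtual address. The PMX, PDPX, PDX, and PTX macros themselves are expected to be defined in mmu.h (not shown in this diff); plausible definitions consistent with the code above, assuming uint64 from types.h:

    #define PMX(va)   (((uint64)(va) >> 39) & 0x1FF)   // level 4: PML4 index
    #define PDPX(va)  (((uint64)(va) >> 30) & 0x1FF)   // level 3: page-directory-pointer index
    #define PDX(va)   (((uint64)(va) >> 21) & 0x1FF)   // level 2: page-directory index
    #define PTX(va)   (((uint64)(va) >> 12) & 0x1FF)   // level 1: page-table index

Each table holds 512 (NPDENTRIES) 8-byte entries, so four 9-bit indices plus the 12-bit page offset cover the 48-bit virtual address space.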
@@ -58,13 +120,13 @@ walkpgdir(pde_t *pgdir, const void *va, int alloc)
// physical addresses starting at pa. va and size might not
// be page-aligned.
static int
-mappages(pde_t *pgdir, void *va, uint size, uint pa, int perm)
+mappages(pde_t *pgdir, void *va, uint64 size, uint64 pa, int perm)
{
char *a, *last;
pte_t *pte;
- a = (char*)PGROUNDDOWN((uint)va);
- last = (char*)PGROUNDDOWN(((uint)va) + size - 1);
+ a = (char*)PGROUNDDOWN((uint64)va);
+ last = (char*)PGROUNDDOWN(((uint64)va) + size - 1);
for(;;){
if((pte = walkpgdir(pgdir, a, 1)) == 0)
return -1;
@@ -80,7 +142,7 @@ mappages(pde_t *pgdir, void *va, uint size, uint pa, int perm)
}
// There is one page table per process, plus one that's used when
-// a CPU is not running any process (kpgdir). The kernel uses the
+// a CPU is not running any process (kpml4). The kernel uses the
// current process's page table during system calls and interrupts;
// page protection bits prevent user code from using the kernel's
// mappings.
@@ -104,35 +166,36 @@ mappages(pde_t *pgdir, void *va, uint size, uint pa, int perm)
// every process's page table.
static struct kmap {
void *virt;
- uint phys_start;
- uint phys_end;
+ uint64 phys_start;
+ uint64 phys_end;
int perm;
} kmap[] = {
{ (void*)KERNBASE, 0, EXTMEM, PTE_W}, // I/O space
{ (void*)KERNLINK, V2P(KERNLINK), V2P(data), 0}, // kern text+rodata
{ (void*)data, V2P(data), PHYSTOP, PTE_W}, // kern data+memory
- { (void*)DEVSPACE, DEVSPACE, 0, PTE_W}, // more devices
+ { (void*)P2V(DEVSPACE), DEVSPACE, DEVSPACETOP, PTE_W}, // more devices
};
// Set up kernel part of a page table.
pde_t*
setupkvm(void)
{
- pde_t *pgdir;
+ pde_t *pml4;
struct kmap *k;
- if((pgdir = (pde_t*)kalloc()) == 0)
+ if((pml4 = (pde_t*)kalloc()) == 0)
return 0;
- memset(pgdir, 0, PGSIZE);
- if (P2V(PHYSTOP) > (void*)DEVSPACE)
+ memset(pml4, 0, PGSIZE);
+ if (PHYSTOP > DEVSPACE)
panic("PHYSTOP too high");
- for(k = kmap; k < &kmap[NELEM(kmap)]; k++)
- if(mappages(pgdir, k->virt, k->phys_end - k->phys_start,
+ for(k = kmap; k < &kmap[NELEM(kmap)]; k++) {
+ if(mappages(pml4, k->virt, k->phys_end - k->phys_start,
(uint)k->phys_start, k->perm) < 0) {
- freevm(pgdir);
+ freevm(pml4, 0);
return 0;
}
- return pgdir;
+ }
+ return pml4;
}
// Allocate one page table for the machine for the kernel address
@@ -140,7 +203,7 @@ setupkvm(void)
void
kvmalloc(void)
{
- kpgdir = setupkvm();
+ kpml4 = setupkvm();
switchkvm();
}
@@ -149,13 +212,17 @@ kvmalloc(void)
void
switchkvm(void)
{
- lcr3(V2P(kpgdir)); // switch to the kernel page table
+ lcr3(V2P(kpml4)); // switch to the kernel page table
}
+
// Switch TSS and h/w page table to correspond to process p.
void
switchuvm(struct proc *p)
{
+ struct desctr dtr;
+ struct cpu *c;
+
if(p == 0)
panic("switchuvm: no process");
if(p->kstack == 0)
@@ -164,16 +231,22 @@ switchuvm(struct proc *p)
panic("switchuvm: no pgdir");
pushcli();
- mycpu()->gdt[SEG_TSS] = SEG16(STS_T32A, &mycpu()->ts,
- sizeof(mycpu()->ts)-1, 0);
- mycpu()->gdt[SEG_TSS].s = 0;
- mycpu()->ts.ss0 = SEG_KDATA << 3;
- mycpu()->ts.esp0 = (uint)p->kstack + KSTACKSIZE;
- // setting IOPL=0 in eflags *and* iomb beyond the tss segment limit
- // forbids I/O instructions (e.g., inb and outb) from user space
- mycpu()->ts.iomb = (ushort) 0xFFFF;
- ltr(SEG_TSS << 3);
+
+ c = mycpu();
+ uint64 base = (uint64) &(c->ts);
+ c->gdt[TSSSEG>>3] = SEGDESC(base, (sizeof(c->ts)-1), SEG_P|SEG_TSS64A);
+ c->gdt[(TSSSEG>>3)+1] = SEGDESCHI(base);
+ c->ts.rsp[0] = (uint64) p->kstack + KSTACKSIZE;
+ c->ts.iomba = (ushort) 0xFFFF;
+
+ dtr.limit = sizeof(c->gdt) - 1;
+ dtr.base = (uint64)c->gdt;
+ lgdt((void *)&dtr.limit);
+
+ ltr(TSSSEG);
+
lcr3(V2P(p->pgdir)); // switch to process's address space
+
popcli();
}
@@ -197,10 +270,11 @@ inituvm(pde_t *pgdir, char *init, uint sz)
int
loaduvm(pde_t *pgdir, char *addr, struct inode *ip, uint offset, uint sz)
{
- uint i, pa, n;
+ uint i, n;
+ uint64 pa;
pte_t *pte;
- if((uint) addr % PGSIZE != 0)
+ if((uint64) addr % PGSIZE != 0)
panic("loaduvm: addr must be page aligned");
for(i = 0; i < sz; i += PGSIZE){
if((pte = walkpgdir(pgdir, addr+i, 0)) == 0)
@@ -222,7 +296,7 @@ int
allocuvm(pde_t *pgdir, uint oldsz, uint newsz)
{
char *mem;
- uint a;
+ uint64 a;
if(newsz >= KERNBASE)
return 0;
@@ -233,13 +307,11 @@ allocuvm(pde_t *pgdir, uint oldsz, uint newsz)
for(; a < newsz; a += PGSIZE){
mem = kalloc();
if(mem == 0){
- cprintf("allocuvm out of memory\n");
deallocuvm(pgdir, newsz, oldsz);
return 0;
}
memset(mem, 0, PGSIZE);
if(mappages(pgdir, (char*)a, PGSIZE, V2P(mem), PTE_W|PTE_U) < 0){
- cprintf("allocuvm out of memory (2)\n");
deallocuvm(pgdir, newsz, oldsz);
kfree(mem);
return 0;
@@ -253,10 +325,10 @@ allocuvm(pde_t *pgdir, uint oldsz, uint newsz)
// need to be less than oldsz. oldsz can be larger than the actual
// process size. Returns the new process size.
int
-deallocuvm(pde_t *pgdir, uint oldsz, uint newsz)
+deallocuvm(pde_t *pgdir, uint64 oldsz, uint64 newsz)
{
pte_t *pte;
- uint a, pa;
+ uint64 a, pa;
if(newsz >= oldsz)
return oldsz;
@@ -281,20 +353,34 @@ deallocuvm(pde_t *pgdir, uint oldsz, uint newsz)
// Free a page table and all the physical memory pages
// in the user part.
void
-freevm(pde_t *pgdir)
+freevm(pde_t *pml4, uint64 sz)
{
- uint i;
+ uint i, j, k;
+ pde_t *pdp, *pd, *pt;
- if(pgdir == 0)
+ if(pml4 == 0)
panic("freevm: no pgdir");
- deallocuvm(pgdir, KERNBASE, 0);
+
+ deallocuvm(pml4, sz, 0);
for(i = 0; i < NPDENTRIES; i++){
- if(pgdir[i] & PTE_P){
- char * v = P2V(PTE_ADDR(pgdir[i]));
- kfree(v);
+ if(pml4[i] & PTE_P){
+ pdp = (pdpe_t*)P2V(PTE_ADDR(pml4[i]));
+ for(j = 0; j < NPDENTRIES; j++){
+ if(pdp[j] & PTE_P){
+ pd = (pde_t*)P2V(PTE_ADDR(pdp[j]));
+ for(k = 0; k < NPDENTRIES; k++){
+ if(pd[k] & PTE_P) {
+ pt = (pde_t*)P2V(PTE_ADDR(pd[k]));
+ kfree((char*)pt);
+ }
+ }
+ kfree((char*)pd);
+ }
+ }
+ kfree((char*)pdp);
}
}
- kfree((char*)pgdir);
+ kfree((char*)pml4);
}
// Clear PTE_U on a page. Used to create an inaccessible
@@ -317,7 +403,8 @@ copyuvm(pde_t *pgdir, uint sz)
{
pde_t *d;
pte_t *pte;
- uint pa, i, flags;
+ uint64 pa, i;
+ uint flags;
char *mem;
if((d = setupkvm()) == 0)
@@ -340,7 +427,7 @@ copyuvm(pde_t *pgdir, uint sz)
return d;
bad:
- freevm(d);
+ freevm(d, sz);
return 0;
}
@@ -366,7 +453,7 @@ int
copyout(pde_t *pgdir, uint va, void *p, uint len)
{
char *buf, *pa0;
- uint n, va0;
+ uint64 n, va0;
buf = (char*)p;
while(len > 0){
diff --git a/x86.h b/x86.h
index 07312a5..17bec0d 100644
--- a/x86.h
+++ b/x86.h
@@ -1,5 +1,7 @@
// Routines to let C code use special x86 instructions.
+#ifndef __ASSEMBLER__
+
static inline uchar
inb(ushort port)
{
@@ -57,32 +59,16 @@ stosl(void *addr, int data, int cnt)
"memory", "cc");
}
-struct segdesc;
-
static inline void
-lgdt(struct segdesc *p, int size)
+lgdt(void *p)
{
- volatile ushort pd[3];
-
- pd[0] = size-1;
- pd[1] = (uint)p;
- pd[2] = (uint)p >> 16;
-
- asm volatile("lgdt (%0)" : : "r" (pd));
+ asm volatile("lgdt (%0)" : : "r" (p) : "memory");
}
-struct gatedesc;
-
static inline void
-lidt(struct gatedesc *p, int size)
+lidt(void *p)
{
- volatile ushort pd[3];
-
- pd[0] = size-1;
- pd[1] = (uint)p;
- pd[2] = (uint)p >> 16;
-
- asm volatile("lidt (%0)" : : "r" (pd));
+ asm volatile("lidt (%0)" : : "r" (p) : "memory");
}
static inline void
@@ -91,11 +77,11 @@ ltr(ushort sel)
asm volatile("ltr %0" : : "r" (sel));
}
-static inline uint
+static inline uint64
readeflags(void)
{
- uint eflags;
- asm volatile("pushfl; popl %0" : "=r" (eflags));
+ uint64 eflags;
+ asm volatile("pushf; pop %0" : "=r" (eflags));
return eflags;
}
@@ -133,51 +119,53 @@ xchg(volatile uint *addr, uint newval)
static inline uint
rcr2(void)
{
- uint val;
- asm volatile("movl %%cr2,%0" : "=r" (val));
+ uint64 val;
+ asm volatile("mov %%cr2,%0" : "=r" (val));
return val;
}
static inline void
-lcr3(uint val)
+lcr3(uint64 val)
+{
+ asm volatile("mov %0,%%cr3" : : "r" (val));
+}
+
+static inline void
+writegs(uint16 v)
{
- asm volatile("movl %0,%%cr3" : : "r" (val));
+ __asm volatile("movw %0, %%gs" : : "r" (v));
}
+
//PAGEBREAK: 36
// Layout of the trap frame built on the stack by the
// hardware and by trapasm.S, and passed to trap().
struct trapframe {
- // registers as pushed by pusha
- uint edi;
- uint esi;
- uint ebp;
- uint oesp; // useless & ignored
- uint ebx;
- uint edx;
- uint ecx;
- uint eax;
-
- // rest of trap frame
- ushort gs;
- ushort padding1;
- ushort fs;
- ushort padding2;
- ushort es;
- ushort padding3;
- ushort ds;
- ushort padding4;
- uint trapno;
-
- // below here defined by x86 hardware
- uint err;
- uint eip;
- ushort cs;
- ushort padding5;
- uint eflags;
-
- // below here only when crossing rings, such as from user to kernel
- uint esp;
- ushort ss;
- ushort padding6;
-};
+ uint64 rax;
+ uint64 rbx;
+ uint64 rcx;
+ uint64 rdx;
+ uint64 rbp;
+ uint64 rsi;
+ uint64 rdi;
+ uint64 r8;
+ uint64 r9;
+ uint64 r10;
+ uint64 r11;
+ uint64 r12;
+ uint64 r13;
+ uint64 r14;
+ uint64 r15;
+ uint64 trapno;
+ uint64 err;
+ uint64 rip;
+ uint16 cs;
+ uint16 padding[3];
+ uint64 rflags;
+ uint64 rsp;
+ uint64 ss;
+}__attribute__((packed));
+
+#endif
+
+#define TF_CS 144 // offset in trapframe for saved cs
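TF_CS is the byte offset of the saved cs within the trap frame that alltraps and sysentry build: fifteen 8-byte registers, then trapno, err, and rip, i.e. 15*8 + 3*8 = 144 bytes before cs. The swapgs decisions in trapasm.S depend on reading that slot. A compile-time check sketch (not in the tree), assuming it sits in a C file compiled after this header:

    _Static_assert(__builtin_offsetof(struct trapframe, cs) == TF_CS,
                   "TF_CS must match the saved %cs slot in struct trapframe");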