| -rw-r--r-- | .gdbinit.tmpl      |  27 |
| -rw-r--r-- | .gdbinit.tmpl-i386 |   5 |
| -rw-r--r-- | .gdbinit.tmpl-x64  |  18 |
| -rw-r--r-- | Makefile           |  79 |
| -rw-r--r-- | bootasm.S          |  88 |
| -rw-r--r-- | console.c          |  30 |
| -rw-r--r-- | defs.h             |  12 |
| -rw-r--r-- | elf.h              |  22 |
| -rw-r--r-- | entry.S            | 273 |
| -rw-r--r-- | entryother.S       |  57 |
| -rw-r--r-- | exec.c             |  30 |
| -rw-r--r-- | initcode.S         |  13 |
| -rw-r--r-- | ioapic.c           |   3 |
| -rw-r--r-- | kalloc.c           |   6 |
| -rw-r--r-- | kernel.ld          |  41 |
| -rw-r--r-- | main.c             |  69 |
| -rw-r--r-- | memlayout.h        |   7 |
| -rw-r--r-- | mmu.h              | 236 |
| -rw-r--r-- | mp.c               |   6 |
| -rw-r--r-- | mp.h               |   8 |
| -rw-r--r-- | msr.h              |  25 |
| -rw-r--r-- | printf.c           |  34 |
| -rw-r--r-- | proc.c             |  34 |
| -rw-r--r-- | proc.h             |  21 |
| -rw-r--r-- | spinlock.c         |  10 |
| -rw-r--r-- | spinlock.h         |   2 |
| -rw-r--r-- | string.c           |   2 |
| -rw-r--r-- | swtch.S            |  36 |
| -rw-r--r-- | syscall.c          |  58 |
| -rw-r--r-- | sysfile.c          |   6 |
| -rw-r--r-- | trap.c             |  29 |
| -rw-r--r-- | trapasm.S          | 150 |
| -rw-r--r-- | traps.h            |   1 |
| -rw-r--r-- | types.h            |   8 |
| -rw-r--r-- | usertests.c        |  38 |
| -rw-r--r-- | usys.S             |   2 |
| -rwxr-xr-x | vectors.pl         |  16 |
| -rw-r--r-- | vm.c               | 213 |
| -rw-r--r-- | x86.h              | 108 |
39 files changed, 1050 insertions, 773 deletions
| diff --git a/.gdbinit.tmpl b/.gdbinit.tmpl deleted file mode 100644 index f71681a..0000000 --- a/.gdbinit.tmpl +++ /dev/null @@ -1,27 +0,0 @@ -set $lastcs = -1 - -define hook-stop -  # There doesn't seem to be a good way to detect if we're in 16- or -  # 32-bit mode, but in 32-bit mode we always run with CS == 8 in the -  # kernel and CS == 35 in user space -  if $cs == 8 || $cs == 35 -    if $lastcs != 8 && $lastcs != 35 -      set architecture i386 -    end -    x/i $pc -  else -    if $lastcs == -1 || $lastcs == 8 || $lastcs == 35 -      set architecture i8086 -    end -    # Translate the segment:offset into a physical address -    printf "[%4x:%4x] ", $cs, $eip -    x/i $cs*16+$eip -  end -  set $lastcs = $cs -end - -echo + target remote localhost:1234\n -target remote localhost:1234 - -echo + symbol-file kernel\n -symbol-file kernel diff --git a/.gdbinit.tmpl-i386 b/.gdbinit.tmpl-i386 new file mode 100644 index 0000000..f4f85d2 --- /dev/null +++ b/.gdbinit.tmpl-i386 @@ -0,0 +1,5 @@ +python +gdb.execute("target remote localhost:26000") +gdb.execute("set architecture i386") +gdb.execute("symbol-file kernel") +gdb.execute("break *0x7c00") diff --git a/.gdbinit.tmpl-x64 b/.gdbinit.tmpl-x64 new file mode 100644 index 0000000..9c120ff --- /dev/null +++ b/.gdbinit.tmpl-x64 @@ -0,0 +1,18 @@ +#if you would like to use gdb in 32bit mode, comment out lines 8 and 15, then uncomment +#the lines after. Note this will only work properly until 64bit mode is enabled in entry.S + +python +gdb.execute("set architecture i386:x86-64:intel") +gdb.execute("target remote localhost:26000") +gdb.execute("symbol-file kernel") +gdb.execute("break start64") +#gdb.execute("break *0x7c00") +try: +  gdb.execute("continue") +except:  +  pass +gdb.execute("disconnect") +gdb.execute("set architecture i386:x86-64") +#gdb.execute("set architecture i386") +gdb.execute("target remote localhost:26000") +gdb.execute("delete break 1") @@ -51,7 +51,7 @@ TOOLPREFIX := $(shell if i386-jos-elf-objdump -i 2>&1 | grep '^elf32-i386$$' >/d  endif  # If the makefile can't find QEMU, specify its path here -# QEMU = qemu-system-i386 +QEMU = qemu-system-x86_64  # Try to infer the correct QEMU  ifndef QEMU @@ -76,11 +76,16 @@ AS = $(TOOLPREFIX)gas  LD = $(TOOLPREFIX)ld  OBJCOPY = $(TOOLPREFIX)objcopy  OBJDUMP = $(TOOLPREFIX)objdump -CFLAGS = -fno-pic -static -fno-builtin -fno-strict-aliasing -O2 -Wall -MD -ggdb -m32 -Werror -fno-omit-frame-pointer + +XFLAGS = -m64 -mcmodel=large -ggdb +# CFLAGS = -fno-pic -static -fno-builtin -fno-strict-aliasing -O2 -Wall -MD -ggdb -Werror -fno-omit-frame-pointer +CFLAGS = -fno-pic -static -fno-builtin -fno-strict-aliasing -Wall -MD -ggdb -Werror -fno-omit-frame-pointer +CFLAGS += -ffreestanding -fno-common -nostdlib $(XFLAGS)  CFLAGS += $(shell $(CC) -fno-stack-protector -E -x c /dev/null >/dev/null 2>&1 && echo -fno-stack-protector) -ASFLAGS = -m32 -gdwarf-2 -Wa,-divide +ASFLAGS = -gdwarf-2 -Wa,-divide $(XFLAGS)  # FreeBSD ld wants ``elf_i386_fbsd'' -LDFLAGS += -m $(shell $(LD) -V | grep elf_i386 2>/dev/null | head -n 1) +LDFLAGS += -m $(shell $(LD) -V | grep elf_x86_64 2>/dev/null | head -n 1) +LDFLAGS += -z max-page-size=4096  # Disable PIE when possible (for Ubuntu 16.10 toolchain)  ifneq ($(shell $(CC) -dumpspecs 2>/dev/null | grep -e '[^f]no-pie'),) @@ -90,23 +95,10 @@ ifneq ($(shell $(CC) -dumpspecs 2>/dev/null | grep -e '[^f]nopie'),)  CFLAGS += -fno-pie -nopie  endif -xv6.img: bootblock kernel -	dd if=/dev/zero of=xv6.img count=10000 -	dd if=bootblock of=xv6.img conv=notrunc -	dd if=kernel 
of=xv6.img seek=1 conv=notrunc - -xv6memfs.img: bootblock kernelmemfs -	dd if=/dev/zero of=xv6memfs.img count=10000 -	dd if=bootblock of=xv6memfs.img conv=notrunc -	dd if=kernelmemfs of=xv6memfs.img seek=1 conv=notrunc - -bootblock: bootasm.S bootmain.c -	$(CC) $(CFLAGS) -fno-pic -O -nostdinc -I. -c bootmain.c -	$(CC) $(CFLAGS) -fno-pic -nostdinc -I. -c bootasm.S -	$(LD) $(LDFLAGS) -N -e start -Ttext 0x7C00 -o bootblock.o bootasm.o bootmain.o -	$(OBJDUMP) -S bootblock.o > bootblock.asm -	$(OBJCOPY) -S -O binary -j .text bootblock.o bootblock -	./sign.pl bootblock +kernel: $(OBJS) entry.o entryother initcode kernel.ld +	$(LD) $(LDFLAGS) -T kernel.ld -o kernel entry.o $(OBJS) -b binary initcode entryother +	$(OBJDUMP) -S kernel > kernel.asm +	$(OBJDUMP) -t kernel | sed '1,/SYMBOL TABLE/d; s/ .* / /; /^$$/d' > kernel.sym  entryother: entryother.S  	$(CC) $(CFLAGS) -fno-pic -nostdinc -I. -c entryother.S @@ -120,23 +112,6 @@ initcode: initcode.S  	$(OBJCOPY) -S -O binary initcode.out initcode  	$(OBJDUMP) -S initcode.o > initcode.asm -kernel: $(OBJS) entry.o entryother initcode kernel.ld -	$(LD) $(LDFLAGS) -T kernel.ld -o kernel entry.o $(OBJS) -b binary initcode entryother -	$(OBJDUMP) -S kernel > kernel.asm -	$(OBJDUMP) -t kernel | sed '1,/SYMBOL TABLE/d; s/ .* / /; /^$$/d' > kernel.sym - -# kernelmemfs is a copy of kernel that maintains the -# disk image in memory instead of writing to a disk. -# This is not so useful for testing persistent storage or -# exploring disk buffering implementations, but it is -# great for testing the kernel on real hardware without -# needing a scratch disk. -MEMFSOBJS = $(filter-out ide.o,$(OBJS)) memide.o -kernelmemfs: $(MEMFSOBJS) entry.o entryother initcode kernel.ld fs.img -	$(LD) $(LDFLAGS) -T kernel.ld -o kernelmemfs entry.o  $(MEMFSOBJS) -b binary initcode entryother fs.img -	$(OBJDUMP) -S kernelmemfs > kernelmemfs.asm -	$(OBJDUMP) -t kernelmemfs | sed '1,/SYMBOL TABLE/d; s/ .* / /; /^$$/d' > kernelmemfs.sym -  tags: $(OBJS) entryother.S _init  	etags *.S *.c @@ -190,8 +165,8 @@ fs.img: mkfs README $(UPROGS)  clean:   	rm -f *.tex *.dvi *.idx *.aux *.log *.ind *.ilg \  	*.o *.d *.asm *.sym vectors.S bootblock entryother \ -	initcode initcode.out kernel xv6.img fs.img kernelmemfs \ -	xv6memfs.img mkfs .gdbinit \ +	initcode initcode.out kernel fs.img kernelmemfs \ +	mkfs .gdbinit \  	$(UPROGS)  # make a printout @@ -204,12 +179,6 @@ xv6.pdf: $(PRINT)  print: xv6.pdf -# run in emulators - -bochs : fs.img xv6.img -	if [ ! -e .bochsrc ]; then ln -s dot-bochsrc .bochsrc; fi -	bochs -q -  # try to generate a unique GDB port  GDBPORT = $(shell expr `id -u` % 5000 + 25000)  # QEMU's gdb stub command line changed in 0.11 @@ -219,25 +188,21 @@ QEMUGDB = $(shell if $(QEMU) -help | grep -q '^-gdb'; \  ifndef CPUS  CPUS := 2  endif -QEMUOPTS = -drive file=fs.img,index=1,media=disk,format=raw -drive file=xv6.img,index=0,media=disk,format=raw -smp $(CPUS) -m 512 $(QEMUEXTRA) - -qemu: fs.img xv6.img +QEMUOPTS = -kernel kernel -drive file=fs.img,index=1,media=disk,format=raw -smp $(CPUS) -m 512 $(QEMUEXTRA) +qemu: fs.img  	$(QEMU) -serial mon:stdio $(QEMUOPTS) -qemu-memfs: xv6memfs.img -	$(QEMU) -drive file=xv6memfs.img,index=0,media=disk,format=raw -smp $(CPUS) -m 256 - -qemu-nox: fs.img xv6.img +qemu-nox: fs.img kernel  	$(QEMU) -nographic $(QEMUOPTS) -.gdbinit: .gdbinit.tmpl +.gdbinit: .gdbinit.tmpl-x64  	sed "s/localhost:1234/localhost:$(GDBPORT)/" < $^ > $@ -qemu-gdb: fs.img xv6.img .gdbinit +qemu-gdb: fs.img kernel .gdbinit  	@echo "*** Now run 'gdb'." 
1>&2 -	$(QEMU) -serial mon:stdio $(QEMUOPTS) -S $(QEMUGDB) +	$(QEMU) $(QEMUOPTS) -S $(QEMUGDB) -qemu-nox-gdb: fs.img xv6.img .gdbinit +qemu-nox-gdb: fs.img kernel .gdbinit  	@echo "*** Now run 'gdb'." 1>&2  	$(QEMU) -nographic $(QEMUOPTS) -S $(QEMUGDB) diff --git a/bootasm.S b/bootasm.S deleted file mode 100644 index 257867c..0000000 --- a/bootasm.S +++ /dev/null @@ -1,88 +0,0 @@ -#include "asm.h" -#include "memlayout.h" -#include "mmu.h" - -# Start the first CPU: switch to 32-bit protected mode, jump into C. -# The BIOS loads this code from the first sector of the hard disk into -# memory at physical address 0x7c00 and starts executing in real mode -# with %cs=0 %ip=7c00. - -.code16                       # Assemble for 16-bit mode -.globl start -start: -  cli                         # BIOS enabled interrupts; disable - -  # Zero data segment registers DS, ES, and SS. -  xorw    %ax,%ax             # Set %ax to zero -  movw    %ax,%ds             # -> Data Segment -  movw    %ax,%es             # -> Extra Segment -  movw    %ax,%ss             # -> Stack Segment - -  # Physical address line A20 is tied to zero so that the first PCs  -  # with 2 MB would run software that assumed 1 MB.  Undo that. -seta20.1: -  inb     $0x64,%al               # Wait for not busy -  testb   $0x2,%al -  jnz     seta20.1 - -  movb    $0xd1,%al               # 0xd1 -> port 0x64 -  outb    %al,$0x64 - -seta20.2: -  inb     $0x64,%al               # Wait for not busy -  testb   $0x2,%al -  jnz     seta20.2 - -  movb    $0xdf,%al               # 0xdf -> port 0x60 -  outb    %al,$0x60 - -  # Switch from real to protected mode.  Use a bootstrap GDT that makes -  # virtual addresses map directly to physical addresses so that the -  # effective memory map doesn't change during the transition. -  lgdt    gdtdesc -  movl    %cr0, %eax -  orl     $CR0_PE, %eax -  movl    %eax, %cr0 - -//PAGEBREAK! -  # Complete the transition to 32-bit protected mode by using a long jmp -  # to reload %cs and %eip.  The segment descriptors are set up with no -  # translation, so that the mapping is still the identity mapping. -  ljmp    $(SEG_KCODE<<3), $start32 - -.code32  # Tell assembler to generate 32-bit code now. -start32: -  # Set up the protected-mode data segment registers -  movw    $(SEG_KDATA<<3), %ax    # Our data segment selector -  movw    %ax, %ds                # -> DS: Data Segment -  movw    %ax, %es                # -> ES: Extra Segment -  movw    %ax, %ss                # -> SS: Stack Segment -  movw    $0, %ax                 # Zero segments not ready for use -  movw    %ax, %fs                # -> FS -  movw    %ax, %gs                # -> GS - -  # Set up the stack pointer and call into C. -  movl    $start, %esp -  call    bootmain - -  # If bootmain returns (it shouldn't), trigger a Bochs -  # breakpoint if running under Bochs, then loop. -  movw    $0x8a00, %ax            # 0x8a00 -> port 0x8a00 -  movw    %ax, %dx -  outw    %ax, %dx -  movw    $0x8ae0, %ax            # 0x8ae0 -> port 0x8a00 -  outw    %ax, %dx -spin: -  jmp     spin - -# Bootstrap GDT -.p2align 2                                # force 4 byte alignment -gdt: -  SEG_NULLASM                             # null seg -  SEG_ASM(STA_X|STA_R, 0x0, 0xffffffff)   # code seg -  SEG_ASM(STA_W, 0x0, 0xffffffff)         # data seg - -gdtdesc: -  .word   (gdtdesc - gdt - 1)             # sizeof(gdt) - 1 -  .long   gdt                             # address gdt - @@ -2,6 +2,8 @@  // Input is from the keyboard or serial port.  
// Output is written to the screen and serial port. +#include <stdarg.h> +  #include "types.h"  #include "defs.h"  #include "param.h" @@ -24,10 +26,11 @@ static struct {    int locking;  } cons; +static char digits[] = "0123456789abcdef"; +  static void  printint(int xx, int base, int sign)  { -  static char digits[] = "0123456789abcdef";    char buf[16];    int i;    uint x; @@ -48,14 +51,25 @@ printint(int xx, int base, int sign)    while(--i >= 0)      consputc(buf[i]);  } + +static void +printptr(uint64 x) { +  int i; +  consputc('0'); +  consputc('x'); +  for (i = 0; i < (sizeof(uint64) * 2); i++, x <<= 4) +    consputc(digits[x >> (sizeof(uint64) * 8 - 4)]); +} + +  //PAGEBREAK: 50  // Print to the console. only understands %d, %x, %p, %s.  void  cprintf(char *fmt, ...)  { +  va_list ap;    int i, c, locking; -  uint *argp;    char *s;    locking = cons.locking; @@ -65,7 +79,7 @@ cprintf(char *fmt, ...)    if (fmt == 0)      panic("null fmt"); -  argp = (uint*)(void*)(&fmt + 1); +  va_start(ap, fmt);    for(i = 0; (c = fmt[i] & 0xff) != 0; i++){      if(c != '%'){        consputc(c); @@ -76,14 +90,16 @@ cprintf(char *fmt, ...)        break;      switch(c){      case 'd': -      printint(*argp++, 10, 1); +      printint(va_arg(ap, int), 10, 1);        break;      case 'x': +      printint(va_arg(ap, int), 16, 1); +      break;      case 'p': -      printint(*argp++, 16, 0); +      printptr(va_arg(ap, uint64));        break;      case 's': -      if((s = (char*)*argp++) == 0) +      if((s = va_arg(ap, char*)) == 0)          s = "(null)";        for(; *s; s++)          consputc(*s); @@ -107,7 +123,7 @@ void  panic(char *s)  {    int i; -  uint pcs[10]; +  uint64 pcs[10];    cli();    cons.locking = 0; @@ -126,7 +126,7 @@ void            swtch(struct context**, struct context*);  // spinlock.c  void            acquire(struct spinlock*); -void            getcallerpcs(void*, uint*); +void            getcallerpcs(void*, uint64*);  int             holding(struct spinlock*);  void            initlock(struct spinlock*, char*);  void            release(struct spinlock*); @@ -152,8 +152,10 @@ char*           strncpy(char*, const char*, int);  int             argint(int, int*);  int             argptr(int, char**, int);  int             argstr(int, char**); -int             fetchint(uint, int*); -int             fetchstr(uint, char**); +int             argaddr(int, uint64 *); +int             fetchint(uint64, int*); +int             fetchstr(uint64, char**); +int             fetchaddr(uint64, uint64*);  void            syscall(void);  // timer.c @@ -176,8 +178,8 @@ void            kvmalloc(void);  pde_t*          setupkvm(void);  char*           uva2ka(pde_t*, char*);  int             allocuvm(pde_t*, uint, uint); -int             deallocuvm(pde_t*, uint, uint); -void            freevm(pde_t*); +int             deallocuvm(pde_t*, uint64, uint64); +void            freevm(pde_t*, uint64);  void            inituvm(pde_t*, char*, uint);  int             loaduvm(pde_t*, char*, struct inode*, uint, uint);  pde_t*          copyuvm(pde_t*, uint); @@ -9,9 +9,9 @@ struct elfhdr {    ushort type;    ushort machine;    uint version; -  uint entry; -  uint phoff; -  uint shoff; +  uint64 entry; +  uint64 phoff; +  uint64 shoff;    uint flags;    ushort ehsize;    ushort phentsize; @@ -23,14 +23,14 @@ struct elfhdr {  // Program section header  struct proghdr { -  uint type; -  uint off; -  uint vaddr; -  uint paddr; -  uint filesz; -  uint memsz; -  uint flags; -  uint align; +  uint32 type; +  uint32 flags; 
+  uint64 off; +  uint64 vaddr; +  uint64 paddr; +  uint64 filesz; +  uint64 memsz; +  uint64 align;  };  // Values for Proghdr type @@ -1,68 +1,223 @@ -# The xv6 kernel starts executing in this file. This file is linked with -# the kernel C code, so it can refer to kernel symbols such as main(). -# The boot block (bootasm.S and bootmain.c) jumps to entry below. -         -# Multiboot header, for multiboot boot loaders like GNU Grub. +# x86-64 bootstrap, assuming load by MultiBoot-compliant loader. +# The MutliBoot specification is at:  # http://www.gnu.org/software/grub/manual/multiboot/multiboot.html -# -# Using GRUB 2, you can boot xv6 from a file stored in a -# Linux file system by copying kernel or kernelmemfs to /boot -# and then adding this menu entry: -# -# menuentry "xv6" { -# 	insmod ext2 -# 	set root='(hd0,msdos1)' -# 	set kernel='/boot/kernel' -# 	echo "Loading ${kernel}..." -# 	multiboot ${kernel} ${kernel} -# 	boot -# } - -#include "asm.h" -#include "memlayout.h" +# GRUB is a MultiBoot loader, as is qemu's -kernel option. +  #include "mmu.h" -#include "param.h" +#include "memlayout.h"   + +# STACK is the size of the bootstrap stack. +#define STACK 8192 -# Multiboot header.  Data to direct multiboot loader. -.p2align 2 +# MultiBoot header. +# http://www.gnu.org/software/grub/manual/multiboot/multiboot.html#Header-layout +.align 4  .text  .globl multiboot_header  multiboot_header:    #define magic 0x1badb002 -  #define flags 0 +  #define flags (1<<16 | 1<<0)    .long magic    .long flags -  .long (-magic-flags) - -# By convention, the _start symbol specifies the ELF entry point. -# Since we haven't set up virtual memory yet, our entry point is -# the physical address of 'entry'. -.globl _start -_start = V2P_WO(entry) - -# Entering xv6 on boot processor, with paging off. -.globl entry -entry: -  # Turn on page size extension for 4Mbyte pages -  movl    %cr4, %eax -  orl     $(CR4_PSE), %eax -  movl    %eax, %cr4 -  # Set page directory -  movl    $(V2P_WO(entrypgdir)), %eax -  movl    %eax, %cr3 -  # Turn on paging. -  movl    %cr0, %eax -  orl     $(CR0_PG|CR0_WP), %eax -  movl    %eax, %cr0 - -  # Set up the stack pointer. -  movl $(stack + KSTACKSIZE), %esp - -  # Jump to main(), and switch to executing at -  # high addresses. The indirect call is needed because -  # the assembler produces a PC-relative instruction -  # for a direct jump. -  mov $main, %eax -  jmp *%eax - -.comm stack, KSTACKSIZE +  .long (- magic - flags)  # checksum +  .long V2P_WO(multiboot_header)  # header address +  .long V2P_WO(multiboot_header)  # load address +  .long V2P_WO(edata)       # load end address +  .long V2P_WO(end)         # bss end address +  .long V2P_WO(start)       # entry address + +# Entry point jumped to by boot loader.  Running in 32-bit mode. +# http://www.gnu.org/software/grub/manual/multiboot/multiboot.html#Machine-state +# +#       EAX = 0x2badb002 +#       EBX = address of multiboot information structure +#       CS = 32-bit read/execute code segment with identity map +#       DS, ES, FS, GS, SS = 32-bit read/write data segment with identity map +#       A20 gate = enabled +#       CR0 = PE set, PG clear +#       EFLAGS = VM clear, IF clear +# +.code32 +.globl start +start: +  # Tell BIOS to do "warm reboot" when we shut down. +  movw $0x1234, 0x472 + +  # Set up multiboot arguments for main. +  movl %eax, %edi +  movl %ebx, %esi + +  # Initialize stack. +  movl $V2P_WO(stack+STACK), %esp +   +  # Zero bss.  QEMU's MultiBoot seems not to. 
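
A note on the new MultiBoot header in entry.S above: a MultiBoot loader accepts the header only if magic, flags, and checksum sum to zero modulo 2^32, which is why the checksum field is written as (- magic - flags). A minimal stand-alone check of that arithmetic (illustrative only, not part of the tree):

    #include <stdint.h>
    #include <assert.h>

    int main(void)
    {
      uint32_t magic = 0x1badb002;
      uint32_t flags = (1 << 16) | (1 << 0);   /* bit 0: page-align modules; bit 16: address fields in the header are valid */
      uint32_t checksum = -(magic + flags);    /* same value as (- magic - flags) in entry.S */
      assert((uint32_t)(magic + flags + checksum) == 0);   /* what the loader verifies */
      return 0;
    }
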
+  # It's possible that the header above is not right, but it looks right. +  # %edi is holding multiboot argument, so save in another register. +  # (The stack is in the bss.) +  movl %edi, %edx +  movl $V2P_WO(edata), %edi +  movl $V2P_WO(end), %ecx +  subl $V2P_WO(edata), %ecx +  movl $0, %eax +  cld +  rep stosb +  movl %edx, %edi + +  call loadgdt +   +  # Enter new 32-bit code segment (already in 32-bit mode). +  ljmp $KCSEG32, $V2P_WO(start32)  // code32 segment selector +   +start32: +  # Initialize page table. +  call initpagetables +  call init32e +   +  movl $V2P_WO(start64), %eax +  # Enter 64-bit mode. +  ljmp $KCSEG, $V2P_WO(tramp64)  // code64 segment selector + +.code64 +start64: +  # Load VA of stack +  movabsq $(stack+STACK), %rsp +  # Clear frame pointer for stack walks +  movl $0, %ebp +  # Call into C code. +  call bpmain +  # should not return from bpmain +  jmp . + +.code32 +.global apstart +apstart: +  call loadgdt +  ljmp $KCSEG32, $V2P_WO(apstart32)  // code32 segment selector +   +apstart32: +  call init32e +  movl $V2P_WO(apstart64), %eax +  ljmp $KCSEG, $V2P_WO(tramp64)  // code64 segment selector + +.code64        +apstart64: +  # Remember (from bootothers), that our kernel stack pointer is +  # at the top of our temporary stack. +  popq %rax +  movq %rax, %rsp +  movq $0, %rbp +  call apmain +1:      jmp 1b +   +.code64 +tramp64: +  # The linker thinks we are running at tramp64, but we're actually +  # running at PADDR(tramp64), so use an explicit calculation to +  # load and jump to the correct address.  %rax should hold the +  # physical address of the jmp target. +  movq $KERNBASE, %r11 +  addq %r11, %rax +  jmp *%rax + +# Initial stack +.comm stack, STACK + +# Page tables.  See section 4.5 of 253668.pdf. +# We map the first GB of physical memory at 0 and at 1 TB (not GB) before +# the end of virtual memory.  At boot time we are using the mapping at 0 +# but during ordinary execution we use the high mapping. +# The intent is that after bootstrap the kernel can expand this mapping +# to cover all the available physical memory. +# This would be easier if we could use the PS bit to create GB-sized entries +# and skip the pdt table, but not all chips support it, and QEMU doesn't. +.align 4096 +pml4: +  .quad V2P_WO(pdpt) + PTE_P + PTE_W   // present, read/write +  .quad 0 +  .space 4096 - 2*16 +  .quad V2P_WO(pdpt) + PTE_P + PTE_W +  .quad 0 + +.align 4096 +pdpt: +  .quad V2P_WO(pdt) + PTE_P + PTE_W +  .space 4096 - 8 + +.align 4096 +pdt: +  // Filled in below. +  .space 4096 + +.code32 +initpagetables: +  pushl %edi +  pushl %ecx +  pushl %eax + +  // Set up 64-bit entry in %edx:%eax. +  // Base address 0, present, read/write, large page. +  movl $(0 | PTE_P | PTE_W | PTE_PS), %eax +  movl $0, %edx + +  // Fill in 512 entries at pdt. +  movl $V2P_WO(pdt), %edi +  movl $512, %ecx +1: +  // Write this 64-bit entry. +  movl %eax, 0(%edi) +  movl %edx, 4(%edi) +  addl $8, %edi +  // 64-bit add to prepare address for next entry. +  // Because this is a large page entry, it covers 512 4k pages (2 MB). +  add $(512*4096), %eax +  adc $0, %edx +  loop 1b + +  popl %eax +  popl %ecx +  popl %edi +  ret + +# Initialize IA-32e mode.  See section 9.8.5 of 253668.pdf. +init32e: +  # Set CR4.PAE and CR4.PSE = 1. +  movl %cr4, %eax +  orl $0x30, %eax +  movl %eax, %cr4 + +  # Load CR3 with physical base address of level 4 page table. +  movl $V2P_WO(pml4), %eax +  movl %eax, %cr3 +   +  # Enable IA-32e mode by setting IA32_EFER.LME = 1. 
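
For readability, here is a C rendering of what the initpagetables loop above computes, using the PTE_P, PTE_W, and PTE_PS values that appear later in this diff's mmu.h hunks. Each of the 512 pdt entries is a 2 MB large page, so the table as a whole maps the first 1 GB of physical memory:

    #include <stdint.h>

    #define PTE_P   0x001   /* present */
    #define PTE_W   0x002   /* writeable */
    #define PTE_PS  0x080   /* large (2 MB) page */

    /* Fill a page-directory table the same way the initpagetables loop does:
       entry i maps physical address i * 2 MB with one large-page entry. */
    void fill_pdt(uint64_t pdt[512])
    {
      uint64_t pa = 0;
      int i;
      for (i = 0; i < 512; i++, pa += 512 * 4096)   /* 512 * 4096 bytes = 2 MB per entry */
        pdt[i] = pa | PTE_P | PTE_W | PTE_PS;
    }
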
+  # Also turn on IA32_EFER.SCE (syscall enable). +  movl $0xc0000080, %ecx +  rdmsr +  orl $0x101, %eax +  wrmsr + +  # Enable paging by setting CR0.PG = 1. +  movl %cr0, %eax +  orl $0x80000000, %eax    +  movl %eax, %cr0 +  nop +  nop + +  ret + +loadgdt: +  subl $8, %esp +  movl $V2P_WO(bootgdt), 4(%esp) +  movw $(8*NSEGS-1), 2(%esp) +  lgdt 2(%esp) +  addl $8, %esp + +  movl $KDSEG, %eax  // data segment selector +  movw %ax, %ds +  movw %ax, %es +  movw %ax, %ss +  movl $0, %eax  // null segment selector +  movw %ax, %fs +  movw %ax, %gs + +  ret diff --git a/entryother.S b/entryother.S index a3b6dc2..3e502f3 100644 --- a/entryother.S +++ b/entryother.S @@ -13,11 +13,9 @@  #  # Startothers (in main.c) sends the STARTUPs one at a time.  # It copies this code (start) at 0x7000.  It puts the address of -# a newly allocated per-core stack in start-4,the address of the -# place to jump to (mpenter) in start-8, and the physical address +# a newly allocated per-core stack in start-12,the address of the +# place to jump to (apstart32) in start-4, and the physical address  # of entrypgdir in start-12. -# -# This code combines elements of bootasm.S and entry.S.  .code16             .globl start @@ -41,53 +39,22 @@ start:    # Complete the transition to 32-bit protected mode by using a long jmp    # to reload %cs and %eip.  The segment descriptors are set up with no    # translation, so that the mapping is still the identity mapping. -  ljmpl    $(SEG_KCODE<<3), $(start32) +  ljmpl    $(KCSEG32), $start32 -//PAGEBREAK! -.code32  # Tell assembler to generate 32-bit code now. +.code32  start32: -  # Set up the protected-mode data segment registers -  movw    $(SEG_KDATA<<3), %ax    # Our data segment selector -  movw    %ax, %ds                # -> DS: Data Segment -  movw    %ax, %es                # -> ES: Extra Segment -  movw    %ax, %ss                # -> SS: Stack Segment -  movw    $0, %ax                 # Zero segments not ready for use -  movw    %ax, %fs                # -> FS -  movw    %ax, %gs                # -> GS - -  # Turn on page size extension for 4Mbyte pages -  movl    %cr4, %eax -  orl     $(CR4_PSE), %eax -  movl    %eax, %cr4 -  # Use entrypgdir as our initial page table -  movl    (start-12), %eax -  movl    %eax, %cr3 -  # Turn on paging. 
-  movl    %cr0, %eax -  orl     $(CR0_PE|CR0_PG|CR0_WP), %eax -  movl    %eax, %cr0 + movl $start-12, %esp + movl start-4, %ecx + jmp *%ecx -  # Switch to the stack allocated by startothers() -  movl    (start-4), %esp -  # Call mpenter() -  call	 *(start-8) - -  movw    $0x8a00, %ax -  movw    %ax, %dx -  outw    %ax, %dx -  movw    $0x8ae0, %ax -  outw    %ax, %dx -spin: -  jmp     spin - -.p2align 2 +.align 4  gdt:    SEG_NULLASM -  SEG_ASM(STA_X|STA_R, 0, 0xffffffff) -  SEG_ASM(STA_W, 0, 0xffffffff) - +  SEG_ASM(0xa, 0, 0xffffffff) +  SEG_ASM(0x2, 0, 0xffffffff) +.align  16  gdtdesc: -  .word   (gdtdesc - gdt - 1) +  .word   0x17 # sizeof(gdt)-1    .long   gdt @@ -4,6 +4,8 @@  #include "mmu.h"  #include "proc.h"  #include "defs.h" +#include "traps.h" +#include "msr.h"  #include "x86.h"  #include "elf.h" @@ -12,18 +14,18 @@ exec(char *path, char **argv)  {    char *s, *last;    int i, off; -  uint argc, sz, sp, ustack[3+MAXARG+1]; +  uint64 argc, sz, sp, ustack[3+MAXARG+1];    struct elfhdr elf;    struct inode *ip;    struct proghdr ph;    pde_t *pgdir, *oldpgdir;    struct proc *curproc = myproc(); - +  uint64 oldsz = curproc->sz; +      begin_op();    if((ip = namei(path)) == 0){      end_op(); -    cprintf("exec: fail\n");      return -1;    }    ilock(ip); @@ -72,7 +74,7 @@ exec(char *path, char **argv)    for(argc = 0; argv[argc]; argc++) {      if(argc >= MAXARG)        goto bad; -    sp = (sp - (strlen(argv[argc]) + 1)) & ~3; +    sp = (sp - (strlen(argv[argc]) + 1)) & ~(sizeof(uint64)-1);      if(copyout(pgdir, sp, argv[argc], strlen(argv[argc]) + 1) < 0)        goto bad;      ustack[3+argc] = sp; @@ -81,10 +83,13 @@ exec(char *path, char **argv)    ustack[0] = 0xffffffff;  // fake return PC    ustack[1] = argc; -  ustack[2] = sp - (argc+1)*4;  // argv pointer +  ustack[2] = sp - (argc+1)*sizeof(uint64);  // argv pointer + +  curproc->tf->rdi = argc; +  curproc->tf->rsi = sp - (argc+1)*sizeof(uint64); -  sp -= (3+argc+1) * 4; -  if(copyout(pgdir, sp, ustack, (3+argc+1)*4) < 0) +  sp -= (3+argc+1) * sizeof(uint64); +  if(copyout(pgdir, sp, ustack, (3+argc+1)*sizeof(uint64)) < 0)      goto bad;    // Save program name for debugging. @@ -92,20 +97,21 @@ exec(char *path, char **argv)      if(*s == '/')        last = s+1;    safestrcpy(curproc->name, last, sizeof(curproc->name)); - +        // Commit to the user image.    
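
On the exec.c argument-copying change above: user stack slots are now 8 bytes, so each string push rounds the stack pointer down to a multiple of sizeof(uint64) instead of 4. A small stand-alone illustration of that rounding (the starting address is made up):

    #include <stdint.h>
    #include <string.h>
    #include <assert.h>

    int main(void)
    {
      uint64_t sp = 4096;
      const char *arg = "hello";
      /* Same rounding as exec(): leave room for the string and its NUL,
         then align down to an 8-byte boundary. */
      sp = (sp - (strlen(arg) + 1)) & ~(uint64_t)(sizeof(uint64_t) - 1);
      assert(sp == 4088 && sp % 8 == 0);
      return 0;
    }
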
oldpgdir = curproc->pgdir;    curproc->pgdir = pgdir;    curproc->sz = sz; -  curproc->tf->eip = elf.entry;  // main -  curproc->tf->esp = sp; +  curproc->tf->rip = elf.entry;  // main +  curproc->tf->rcx = elf.entry; +  curproc->tf->rsp = sp;    switchuvm(curproc); -  freevm(oldpgdir); +  freevm(oldpgdir, oldsz);    return 0;   bad:    if(pgdir) -    freevm(pgdir); +    freevm(pgdir, sz);    if(ip){      iunlockput(ip);      end_op(); @@ -8,16 +8,15 @@  # exec(init, argv)  .globl start  start: -  pushl $argv -  pushl $init -  pushl $0  // where caller pc would be -  movl $SYS_exec, %eax -  int $T_SYSCALL +  mov $init, %rdi +  mov $argv, %rsi +  mov $SYS_exec, %rax   +  syscall  # for(;;) exit();  exit: -  movl $SYS_exit, %eax -  int $T_SYSCALL +  mov $SYS_exit, %rax +  syscall    jmp exit  # char init[] = "/init\0"; @@ -4,6 +4,7 @@  #include "types.h"  #include "defs.h" +#include "memlayout.h"  #include "traps.h"  #define IOAPIC  0xFEC00000   // Default physical address of IO APIC @@ -50,7 +51,7 @@ ioapicinit(void)  {    int i, id, maxintr; -  ioapic = (volatile struct ioapic*)IOAPIC; +  ioapic = P2V((volatile struct ioapic*)IOAPIC);    maxintr = (ioapicread(REG_VER) >> 16) & 0xFF;    id = ioapicread(REG_ID) >> 24;    if(id != ioapicid) @@ -47,7 +47,7 @@ void  freerange(void *vstart, void *vend)  {    char *p; -  p = (char*)PGROUNDUP((uint)vstart); +  p = (char*)PGROUNDUP((uint64)vstart);    for(; p + PGSIZE <= (char*)vend; p += PGSIZE)      kfree(p);  } @@ -61,7 +61,7 @@ kfree(char *v)  {    struct run *r; -  if((uint)v % PGSIZE || v < end || V2P(v) >= PHYSTOP) +  if((uint64)v % PGSIZE || v < end || V2P(v) >= PHYSTOP)      panic("kfree");    // Fill with junk to catch dangling refs. @@ -91,6 +91,8 @@ kalloc(void)      kmem.freelist = r->next;    if(kmem.use_lock)      release(&kmem.lock); +  if(r != 0 && (uint64) r < KERNBASE) +    panic("kalloc");    return (char*)r;  } @@ -1,22 +1,13 @@ -/* Simple linker script for the JOS kernel. -   See the GNU ld 'info' manual ("info ld") to learn the syntax. */ - -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") -OUTPUT_ARCH(i386) -ENTRY(_start) +OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") +OUTPUT_ARCH(i386:x86-64)  SECTIONS  { -	/* Link the kernel at this address: "." means the current address */ -        /* Must be equal to KERNLINK */ -	. = 0x80100000; - +	. = 0xFFFFFF0000100000; +	PROVIDE(text = .);  	.text : AT(0x100000) {  		*(.text .stub .text.* .gnu.linkonce.t.*)  	} - -	PROVIDE(etext = .);	/* Define the 'etext' symbol to this value */ -  	.rodata : {  		*(.rodata .rodata.* .gnu.linkonce.r.*)  	} @@ -38,31 +29,21 @@ SECTIONS  				   for this section */  	} -	/* Adjust the address for the data segment to the next page */  	. = ALIGN(0x1000); -	/* Conventionally, Unix linkers provide pseudo-symbols -	 * etext, edata, and end, at the end of the text, data, and bss. -	 * For the kernel mapping, we need the address at the beginning -	 * of the data section, but that's not one of the conventional -	 * symbols, because the convention started before there was a -	 * read-only rodata section between text and data. */ -	PROVIDE(data = .); - -	/* The data segment */ +        /* Conventionally, Unix linkers provide pseudo-symbols +         * etext, edata, and end, at the end of the text, data, and bss. 
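
The kalloc.c casts above matter because the page-rounding macros now operate on full 64-bit pointer values. A quick stand-alone check, using the PGROUNDUP/PGROUNDDOWN definitions that appear unchanged in the mmu.h hunks later in this diff:

    #include <stdint.h>
    #include <assert.h>

    #define PGSIZE 4096
    #define PGROUNDUP(sz)  (((sz)+PGSIZE-1) & ~(PGSIZE-1))
    #define PGROUNDDOWN(a) (((a)) & ~(PGSIZE-1))

    int main(void)
    {
      uint64_t va = 0xFFFFFF0000100ABCULL;   /* an address above the new KERNBASE */
      assert(PGROUNDUP(va)   == 0xFFFFFF0000101000ULL);
      assert(PGROUNDDOWN(va) == 0xFFFFFF0000100000ULL);
      return 0;
    }
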
+         * For the kernel mapping, we need the address at the beginning +         * of the data section, but that's not one of the conventional +         * symbols, because the convention started before there was a +         * read-only rodata section between text and data. */ +        PROVIDE(data = .);  	.data : {  		*(.data)  	} -  	PROVIDE(edata = .); -  	.bss : {  		*(.bss)  	} -  	PROVIDE(end = .); - -	/DISCARD/ : { -		*(.eh_frame .note.GNU-stack) -	}  } @@ -6,17 +6,22 @@  #include "proc.h"  #include "x86.h" -static void startothers(void); -static void mpmain(void)  __attribute__((noreturn));  extern pde_t *kpgdir;  extern char end[]; // first address after kernel loaded from ELF file +static void main(void)  __attribute__((noreturn)); +static void startothers(void); + +  // Bootstrap processor starts running C code here.  // Allocate a real stack and switch to it, first  // doing some setup required for memory allocator to work.  int -main(void) +bpmain(uint64 mbmagic, uint64 mbaddr)  { +  if(mbmagic != 0x2badb002) +       panic("multiboot header not found"); +    kinit1(end, P2V(4*1024*1024)); // phys page allocator    kvmalloc();      // kernel page table    mpinit();        // detect other processors @@ -30,26 +35,19 @@ main(void)    tvinit();        // trap vectors    binit();         // buffer cache    fileinit();      // file table -  ideinit();       // disk  +  ideinit();       // disk +      startothers();   // start other processors +     kinit2(P2V(4*1024*1024), P2V(PHYSTOP)); // must come after startothers()    userinit();      // first user process -  mpmain();        // finish this processor's setup -} - -// Other CPUs jump here from entryother.S. -static void -mpenter(void) -{ -  switchkvm(); -  seginit(); -  lapicinit(); -  mpmain(); +  main();         +  return 0;  }  // Common CPU setup code.  static void -mpmain(void) +main(void)  {    cprintf("cpu%d: starting %d\n", cpuid(), cpuid());    idtinit();       // load idt register @@ -57,7 +55,17 @@ mpmain(void)    scheduler();     // start running processes  } -pde_t entrypgdir[];  // For entry.S +// Other CPUs jump here from entryother.S. +void +apmain(void) +{ +  switchkvm(); +  seginit(); +  lapicinit(); +  main(); +} + +void apstart(void);  // Start the non-boot (AP) processors.  static void @@ -72,7 +80,7 @@ startothers(void)    // The linker has placed the image of entryother.S in    // _binary_entryother_start.    code = P2V(0x7000); -  memmove(code, _binary_entryother_start, (uint)_binary_entryother_size); +  memmove(code, _binary_entryother_start, (uint64)_binary_entryother_size);    for(c = cpus; c < cpus+ncpu; c++){      if(c == mycpu())  // We've started already. @@ -82,9 +90,8 @@ startothers(void)      // pgdir to use. We cannot use kpgdir yet, because the AP processor      // is running in low  memory, so we use entrypgdir for the APs too.      stack = kalloc(); -    *(void**)(code-4) = stack + KSTACKSIZE; -    *(void(**)(void))(code-8) = mpenter; -    *(int**)(code-12) = (void *) V2P(entrypgdir); +    *(uint32*)(code-4) = V2P(apstart); +    *(uint64*)(code-12) = (uint64) (stack+KSTACKSIZE);      lapicstartap(c->apicid, V2P(code)); @@ -94,23 +101,3 @@ startothers(void)    }  } -// The boot page table used in entry.S and entryother.S. -// Page directories (and page tables) must start on page boundaries, -// hence the __aligned__ attribute. -// PTE_PS in a page directory entry enables 4Mbyte pages. 
- -__attribute__((__aligned__(PGSIZE))) -pde_t entrypgdir[NPDENTRIES] = { -  // Map VA's [0, 4MB) to PA's [0, 4MB) -  [0] = (0) | PTE_P | PTE_W | PTE_PS, -  // Map VA's [KERNBASE, KERNBASE+4MB) to PA's [0, 4MB) -  [KERNBASE>>PDXSHIFT] = (0) | PTE_P | PTE_W | PTE_PS, -}; - -//PAGEBREAK! -// Blank page. -//PAGEBREAK! -// Blank page. -//PAGEBREAK! -// Blank page. - diff --git a/memlayout.h b/memlayout.h index d1615f7..87818d3 100644 --- a/memlayout.h +++ b/memlayout.h @@ -2,13 +2,14 @@  #define EXTMEM  0x100000            // Start of extended memory  #define PHYSTOP 0xE000000           // Top physical memory -#define DEVSPACE 0xFE000000         // Other devices are at high addresses +#define DEVSPACE 0xFE000000         // Other devices are top of 32-bit address space +#define DEVSPACETOP 0x100000000  // Key addresses for address space layout (see kmap in vm.c for layout) -#define KERNBASE 0x80000000         // First kernel virtual address +#define KERNBASE  0xFFFFFF0000000000  // First kernel virtual address  #define KERNLINK (KERNBASE+EXTMEM)  // Address where kernel is linked -#define V2P(a) (((uint) (a)) - KERNBASE) +#define V2P(a) (((uint64) (a)) - KERNBASE)  #define P2V(a) ((void *)(((char *) (a)) + KERNBASE))  #define V2P_WO(x) ((x) - KERNBASE)    // same as V2P, but without casts @@ -2,8 +2,10 @@  // x86 memory management unit (MMU).  // Eflags register +#define FL_TF           0x00000100      // Trap Flag  #define FL_IF           0x00000200      // Interrupt Enable +  // Control Register flags  #define CR0_PE          0x00000001      // Protection Enable  #define CR0_WP          0x00010000      // Write Protect @@ -11,81 +13,104 @@  #define CR4_PSE         0x00000010      // Page size extension -// various segment selectors. -#define SEG_KCODE 1  // kernel code -#define SEG_KDATA 2  // kernel data+stack -#define SEG_UCODE 3  // user code -#define SEG_UDATA 4  // user data+stack -#define SEG_TSS   5  // this process's task state +// Segment selectors (indexes) in our GDTs. +// Defined by our convention, not the architecture. +#define KCSEG32 (1<<3)  /* kernel 32-bit code segment */ +#define KCSEG   (2<<3)  /* kernel code segment */ +#define KDSEG   (3<<3)  /* kernel data segment */ +#define TSSSEG  (4<<3)  /* tss segment - takes two slots */ +#define UDSEG   (6<<3)  /* user data segment */ +#define UCSEG   (7<<3)  /* user code segment */ -// cpu->gdt[NSEGS] holds the above segments. 
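
The memlayout.h hunk above moves KERNBASE to 0xFFFFFF0000000000 and widens the V2P cast to uint64; the kernel still relies on a plain direct mapping, virtual = physical + KERNBASE. A stand-alone round trip through the same macros (purely illustrative; EXTMEM is the 0x100000 load address from memlayout.h):

    #include <stdint.h>
    #include <assert.h>

    #define KERNBASE 0xFFFFFF0000000000ULL
    #define V2P(a) (((uint64_t)(a)) - KERNBASE)
    #define P2V(a) ((void *)(((char *)(a)) + KERNBASE))

    int main(void)
    {
      uint64_t pa = 0x100000;                 /* EXTMEM: where the kernel is loaded */
      void *va = P2V(pa);
      assert((uint64_t)va == KERNBASE + 0x100000);
      assert(V2P(va) == pa);
      return 0;
    }
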
-#define NSEGS     6 +#define NSEGS 8  #ifndef __ASSEMBLER__ -// Segment Descriptor  struct segdesc { -  uint lim_15_0 : 16;  // Low bits of segment limit -  uint base_15_0 : 16; // Low bits of segment base address -  uint base_23_16 : 8; // Middle bits of segment base address -  uint type : 4;       // Segment type (see STS_ constants) -  uint s : 1;          // 0 = system, 1 = application -  uint dpl : 2;        // Descriptor Privilege Level -  uint p : 1;          // Present -  uint lim_19_16 : 4;  // High bits of segment limit -  uint avl : 1;        // Unused (available for software use) -  uint rsv1 : 1;       // Reserved -  uint db : 1;         // 0 = 16-bit segment, 1 = 32-bit segment -  uint g : 1;          // Granularity: limit scaled by 4K when set -  uint base_31_24 : 8; // High bits of segment base address +	uint16 limit0; +	uint16 base0; +	uint8 base1; +	uint8 bits; +	uint8 bitslimit1; +	uint8 base2;  }; -// Normal segment -#define SEG(type, base, lim, dpl) (struct segdesc)    \ -{ ((lim) >> 12) & 0xffff, (uint)(base) & 0xffff,      \ -  ((uint)(base) >> 16) & 0xff, type, 1, dpl, 1,       \ -  (uint)(lim) >> 28, 0, 0, 1, 1, (uint)(base) >> 24 } -#define SEG16(type, base, lim, dpl) (struct segdesc)  \ -{ (lim) & 0xffff, (uint)(base) & 0xffff,              \ -  ((uint)(base) >> 16) & 0xff, type, 1, dpl, 1,       \ -  (uint)(lim) >> 16, 0, 0, 1, 0, (uint)(base) >> 24 } +// SEGDESC constructs a segment descriptor literal +// with the given, base, limit, and type bits. +#define SEGDESC(base, limit, bits) (struct segdesc){ \ +	(limit)&0xffff, (base)&0xffff, \ +	((base)>>16)&0xff, \ +	(bits)&0xff, \ +	(((bits)>>4)&0xf0) | ((limit>>16)&0xf), \ +	((base)>>24)&0xff, \ +} + +// SEGDESCHI constructs an extension segment descriptor +// literal that records the high bits of base. +#define SEGDESCHI(base) (struct segdesc) {                        \ +  (((base)>>32)&0xffff), (((base)>>48)&0xffff), \ +} +  #endif  #define DPL_USER    0x3     // User DPL +#define SEG_A      (1<<0)      /* segment accessed bit */ +#define SEG_R      (1<<1)      /* readable (code) */ +#define SEG_W      (1<<1)      /* writable (data) */ +#define SEG_C      (1<<2)      /* conforming segment (code) */ +#define SEG_E      (1<<2)      /* expand-down bit (data) */ +#define SEG_CODE   (1<<3)      /* code segment (instead of data) */ + +// User and system segment bits. 
+#define SEG_S      (1<<4)      /* if 0, system descriptor */ +#define SEG_DPL(x) ((x)<<5)    /* descriptor privilege level (2 bits) */ +#define SEG_P      (1<<7)      /* segment present */ +#define SEG_AVL    (1<<8)      /* available for operating system use */ +#define SEG_L      (1<<9)      /* long mode */ +#define SEG_D      (1<<10)     /* default operation size 32-bit */ +#define SEG_G      (1<<11)     /* granularity */ +  // Application segment type bits  #define STA_X       0x8     // Executable segment  #define STA_W       0x2     // Writeable (non-executable segments)  #define STA_R       0x2     // Readable (executable segments)  // System segment type bits -#define STS_T32A    0x9     // Available 32-bit TSS -#define STS_IG32    0xE     // 32-bit Interrupt Gate -#define STS_TG32    0xF     // 32-bit Trap Gate - -// A virtual address 'la' has a three-part structure as follows: +#define SEG_LDT    (2<<0)      /* local descriptor table */ +#define SEG_TSS64A (9<<0)      /* available 64-bit TSS */ +#define SEG_TSS64B (11<<0)     /* busy 64-bit TSS */ +#define SEG_CALL64 (12<<0)     /* 64-bit call gate */ +#define SEG_INTR64 (14<<0)     /* 64-bit interrupt gate */ +#define SEG_TRAP64 (15<<0)     /* 64-bit trap gate */ + +// A virtual address 'la' has a six-part structure as follows:  // -// +--------10------+-------10-------+---------12----------+ -// | Page Directory |   Page Table   | Offset within Page  | -// |      Index     |      Index     |                     | -// +----------------+----------------+---------------------+ -//  \--- PDX(va) --/ \--- PTX(va) --/ - +// +--16--+---9---+------9-------+-----9----+----9-------+----12-------+ +// | Sign | PML4  |Page Directory| Page Dir |Page Table  | Offset Page | +// |Extend| Index | Pointer Index|  Index   |  Index     | in Page     | +// +------+-------+--------------+----------+------------+-------------+ +//       \-PMX(va)-/\-PDPX(va)--/ \-PDX(va)-/ \-PTX(va)-/ + +#define PMX(va)         (((uint64)(va) >> PML4XSHIFT) & PXMASK) +#define PDPX(va)         (((uint64)(va) >> PDPXSHIFT) & PXMASK)  // page directory index -#define PDX(va)         (((uint)(va) >> PDXSHIFT) & 0x3FF) - +#define PDX(va)         (((uint64)(va) >> PDXSHIFT) & PXMASK)  // page table index -#define PTX(va)         (((uint)(va) >> PTXSHIFT) & 0x3FF) +#define PTX(va)         (((uint64)(va) >> PTXSHIFT) & PXMASK)  // construct virtual address from indexes and offset -#define PGADDR(d, t, o) ((uint)((d) << PDXSHIFT | (t) << PTXSHIFT | (o))) +#define PGADDR(d, t, o) ((uint64)((d) << PDXSHIFT | (t) << PTXSHIFT | (o)))  // Page directory and page table constants. 
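
A hypothetical example of how the SEGDESC macro and SEG_* bits above compose a long-mode kernel code descriptor; the real boot GDT lives in vm.c, which is not part of this excerpt, so treat the exact bit choice here as an assumption. The struct and macro are copied so the sketch compiles stand-alone; 64-bit system descriptors such as the TSS need a second slot, which is what SEGDESCHI and the "takes two slots" comment on TSSSEG are for.

    #include <stdint.h>
    #include <assert.h>

    typedef uint16_t uint16;
    typedef uint8_t  uint8;

    /* Copies of struct segdesc and SEGDESC from the mmu.h hunk above. */
    struct segdesc { uint16 limit0; uint16 base0; uint8 base1; uint8 bits; uint8 bitslimit1; uint8 base2; };
    #define SEGDESC(base, limit, bits) (struct segdesc){ \
        (limit)&0xffff, (base)&0xffff, ((base)>>16)&0xff, (bits)&0xff, \
        (((bits)>>4)&0xf0) | ((limit>>16)&0xf), ((base)>>24)&0xff, }

    #define SEG_R      (1<<1)
    #define SEG_CODE   (1<<3)
    #define SEG_S      (1<<4)
    #define SEG_DPL(x) ((x)<<5)
    #define SEG_P      (1<<7)
    #define SEG_L      (1<<9)

    int main(void)
    {
      /* Hypothetical long-mode kernel code segment: base and limit are
         ignored in 64-bit mode, access byte is 0x9a, L bit set. */
      struct segdesc kcode = SEGDESC(0, 0, SEG_R|SEG_CODE|SEG_S|SEG_DPL(0)|SEG_P|SEG_L);
      assert(kcode.bits == 0x9a);
      assert(kcode.bitslimit1 == 0x20);
      return 0;
    }
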
-#define NPDENTRIES      1024    // # directory entries per page directory -#define NPTENTRIES      1024    // # PTEs per page table +#define NPDENTRIES      512    // # directory entries per page directory +#define NPTENTRIES      512    // # PTEs per page table  #define PGSIZE          4096    // bytes mapped by a page  #define PTXSHIFT        12      // offset of PTX in a linear address -#define PDXSHIFT        22      // offset of PDX in a linear address +#define PDXSHIFT        21      // offset of PDX in a linear address +#define PDPXSHIFT       30      // offset of PDPX in a linear address +#define PML4XSHIFT      39      // offset of PML4X in a linear address +#define PXMASK          0X1FF  #define PGROUNDUP(sz)  (((sz)+PGSIZE-1) & ~(PGSIZE-1))  #define PGROUNDDOWN(a) (((a)) & ~(PGSIZE-1)) @@ -95,87 +120,54 @@ struct segdesc {  #define PTE_W           0x002   // Writeable  #define PTE_U           0x004   // User  #define PTE_PS          0x080   // Page Size +#define PTE_PWT         0x008   // Write-Through +#define PTE_PCD         0x010   // Cache-Disable  // Address in page table or page directory entry -#define PTE_ADDR(pte)   ((uint)(pte) & ~0xFFF) -#define PTE_FLAGS(pte)  ((uint)(pte) &  0xFFF) +#define PTE_ADDR(pte)   ((uint64)(pte) & ~0xFFF) +#define PTE_FLAGS(pte)  ((uint64)(pte) &  0xFFF)  #ifndef __ASSEMBLER__ -typedef uint pte_t; -// Task state segment format -struct taskstate { -  uint link;         // Old ts selector -  uint esp0;         // Stack pointers and segment selectors -  ushort ss0;        //   after an increase in privilege level -  ushort padding1; -  uint *esp1; -  ushort ss1; -  ushort padding2; -  uint *esp2; -  ushort ss2; -  ushort padding3; -  void *cr3;         // Page directory base -  uint *eip;         // Saved state from last task switch -  uint eflags; -  uint eax;          // More saved state (registers) -  uint ecx; -  uint edx; -  uint ebx; -  uint *esp; -  uint *ebp; -  uint esi; -  uint edi; -  ushort es;         // Even more saved state (segment selectors) -  ushort padding4; -  ushort cs; -  ushort padding5; -  ushort ss; -  ushort padding6; -  ushort ds; -  ushort padding7; -  ushort fs; -  ushort padding8; -  ushort gs; -  ushort padding9; -  ushort ldt; -  ushort padding10; -  ushort t;          // Trap on task switch -  ushort iomb;       // I/O map base address -}; +typedef uint64 pml4e_t; +typedef uint64 pdpe_t; +typedef uint64 pte_t; -// Gate descriptors for interrupts and traps -struct gatedesc { -  uint off_15_0 : 16;   // low 16 bits of offset in segment -  uint cs : 16;         // code segment selector -  uint args : 5;        // # args, 0 for interrupt/trap gates -  uint rsv1 : 3;        // reserved(should be zero I guess) -  uint type : 4;        // type(STS_{IG32,TG32}) -  uint s : 1;           // must be 0 (system) -  uint dpl : 2;         // descriptor(meaning new) privilege level -  uint p : 1;           // Present -  uint off_31_16 : 16;  // high bits of offset in segment +struct taskstate { +  uint8 reserved0[4]; +  uint64 rsp[3]; +  uint64 ist[8]; +  uint8 reserved1[10]; +  uint16 iomba; +  uint8 iopb[0]; +} __attribute__ ((packed)); + +#define INT_P      (1<<7)      /* interrupt descriptor present */ + +struct intgate +{ +	uint16 rip0; +	uint16 cs; +	uint8 reserved0; +	uint8 bits; +	uint16 rip1; +	uint32 rip2; +	uint32 reserved1;  }; -// Set up a normal interrupt/trap gate descriptor. -// - istrap: 1 for a trap (= exception) gate, 0 for an interrupt gate. 
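
The new four-level lookup splits a canonical virtual address into four 9-bit indexes plus a 12-bit page offset, with bits 48-63 sign-extending bit 47. A stand-alone walk of those fields using the PMX/PDPX/PDX/PTX shifts defined above, applied to KERNLINK (KERNBASE + 0x100000):

    #include <stdint.h>
    #include <stdio.h>

    #define PXMASK      0x1FF
    #define PTXSHIFT    12
    #define PDXSHIFT    21
    #define PDPXSHIFT   30
    #define PML4XSHIFT  39

    #define PMX(va)   (((uint64_t)(va) >> PML4XSHIFT) & PXMASK)
    #define PDPX(va)  (((uint64_t)(va) >> PDPXSHIFT) & PXMASK)
    #define PDX(va)   (((uint64_t)(va) >> PDXSHIFT) & PXMASK)
    #define PTX(va)   (((uint64_t)(va) >> PTXSHIFT) & PXMASK)

    int main(void)
    {
      uint64_t va = 0xFFFFFF0000100000ULL;   /* KERNLINK */
      /* Prints: pml4 510 pdpt 0 pd 0 pt 256 -- entry 510 is the same slot
         entry.S fills in pml4 for the high kernel mapping. */
      printf("pml4 %llu pdpt %llu pd %llu pt %llu\n",
             (unsigned long long)PMX(va), (unsigned long long)PDPX(va),
             (unsigned long long)PDX(va), (unsigned long long)PTX(va));
      return 0;
    }
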
-//   interrupt gate clears FL_IF, trap gate leaves FL_IF alone -// - sel: Code segment selector for interrupt/trap handler -// - off: Offset in code segment for interrupt/trap handler -// - dpl: Descriptor Privilege Level - -//        the privilege level required for software to invoke -//        this interrupt/trap gate explicitly using an int instruction. -#define SETGATE(gate, istrap, sel, off, d)                \ -{                                                         \ -  (gate).off_15_0 = (uint)(off) & 0xffff;                \ -  (gate).cs = (sel);                                      \ -  (gate).args = 0;                                        \ -  (gate).rsv1 = 0;                                        \ -  (gate).type = (istrap) ? STS_TG32 : STS_IG32;           \ -  (gate).s = 0;                                           \ -  (gate).dpl = (d);                                       \ -  (gate).p = 1;                                           \ -  (gate).off_31_16 = (uint)(off) >> 16;                  \ +// INTDESC constructs an interrupt descriptor literal +// that records the given code segment, instruction pointer, +// and type bits. +#define INTDESC(cs, rip, bits) (struct intgate){ \ +	(rip)&0xffff, (cs), 0, bits, ((rip)>>16)&0xffff, \ +	(uint64)(rip)>>32, 0, \  } +// See section 4.6 of amd64 vol2 +struct desctr +{ +  uint16 limit; +  uint64 base; +} __attribute__((packed, aligned(16)));   // important! +  #endif @@ -28,7 +28,7 @@ sum(uchar *addr, int len)  // Look for an MP structure in the len bytes at addr.  static struct mp* -mpsearch1(uint a, int len) +mpsearch1(uint64 a, int len)  {    uchar *e, *p, *addr; @@ -77,7 +77,7 @@ mpconfig(struct mp **pmp)    if((mp = mpsearch()) == 0 || mp->physaddr == 0)      return 0; -  conf = (struct mpconf*) P2V((uint) mp->physaddr); +  conf = (struct mpconf*) P2V((uint64) mp->physaddr);    if(memcmp(conf, "PCMP", 4) != 0)      return 0;    if(conf->version != 1 && conf->version != 4) @@ -101,7 +101,7 @@ mpinit(void)    if((conf = mpconfig(&mp)) == 0)      panic("Expect to run on an SMP");    ismp = 1; -  lapic = (uint*)conf->lapicaddr; +  lapic = P2V((uint64)conf->lapicaddr_p);    for(p=(uchar*)(conf+1), e=(uchar*)conf+conf->length; p<e; ){      switch(*p){      case MPPROC: @@ -2,7 +2,7 @@  struct mp {             // floating pointer    uchar signature[4];           // "_MP_" -  void *physaddr;               // phys addr of MP config table +  uint32 physaddr;               // phys addr of MP config table    uchar length;                 // 1    uchar specrev;                // [14]    uchar checksum;               // all bytes must add up to 0 @@ -17,10 +17,10 @@ struct mpconf {         // configuration table header    uchar version;                // [14]    uchar checksum;               // all bytes must add up to 0    uchar product[20];            // product id -  uint *oemtable;               // OEM table pointer +  uint32 oemtable;               // OEM table pointer    ushort oemlength;             // OEM table length    ushort entry;                 // entry count -  uint *lapicaddr;              // address of local APIC +  uint32 lapicaddr_p;              // address of local APIC    ushort xlength;               // extended table length    uchar xchecksum;              // extended table checksum    uchar reserved; @@ -42,7 +42,7 @@ struct mpioapic {       // I/O APIC table entry    uchar apicno;                 // I/O APIC id    uchar version;                // I/O APIC version    uchar flags;                  // I/O APIC flags 
-  uint *addr;                  // I/O APIC address +  uint32 addr_p;                  // I/O APIC address  };  // Table entry types @@ -0,0 +1,25 @@ +// SYSCALL and SYSRET registers +#define MSR_STAR        0xc0000081 +#define MSR_LSTAR       0xc0000082 +#define MSR_CSTAR       0xc0000083 +#define MSR_SFMASK      0xc0000084 + +// GS +#define MSR_GS_BASE     0xc0000101 +#define MSR_GS_KERNBASE 0xc0000102 + +static inline uint64 +readmsr(uint32 msr) +{ +  uint32 hi, lo; +  __asm volatile("rdmsr" : "=d" (hi), "=a" (lo) : "c" (msr)); +  return ((uint64) lo) | (((uint64) hi) << 32); +} + +static inline void +writemsr(uint64 msr, uint64 val) +{ +  uint32 lo = val & 0xffffffff; +  uint32 hi = val >> 32; +  __asm volatile("wrmsr" : : "c" (msr), "a" (lo), "d" (hi) : "memory"); +} @@ -2,6 +2,10 @@  #include "stat.h"  #include "user.h" +#include <stdarg.h> + +static char digits[] = "0123456789ABCDEF"; +  static void  putc(int fd, char c)  { @@ -11,7 +15,6 @@ putc(int fd, char c)  static void  printint(int fd, int xx, int base, int sgn)  { -  static char digits[] = "0123456789ABCDEF";    char buf[16];    int i, neg;    uint x; @@ -35,16 +38,25 @@ printint(int fd, int xx, int base, int sgn)      putc(fd, buf[i]);  } +static void +printptr(int fd, uint64 x) { +  int i; +  putc(fd, '0'); +  putc(fd, 'x'); +  for (i = 0; i < (sizeof(uint64) * 2); i++, x <<= 4) +    putc(fd, digits[x >> (sizeof(uint64) * 8 - 4)]); +} +  // Print to the given fd. Only understands %d, %x, %p, %s.  void  printf(int fd, const char *fmt, ...)  { +  va_list ap;    char *s;    int c, i, state; -  uint *ap; +  va_start(ap, fmt);    state = 0; -  ap = (uint*)(void*)&fmt + 1;    for(i = 0; fmt[i]; i++){      c = fmt[i] & 0xff;      if(state == 0){ @@ -55,14 +67,13 @@ printf(int fd, const char *fmt, ...)        }      } else if(state == '%'){        if(c == 'd'){ -        printint(fd, *ap, 10, 1); -        ap++; -      } else if(c == 'x' || c == 'p'){ -        printint(fd, *ap, 16, 0); -        ap++; +        printint(fd, va_arg(ap, int), 10, 1); +      } else if(c == 'x') { +        printint(fd, va_arg(ap, int), 16, 0); +      } else if(c == 'p') { +        printptr(fd, va_arg(ap, uint64));        } else if(c == 's'){ -        s = (char*)*ap; -        ap++; +        s = va_arg(ap, char*);          if(s == 0)            s = "(null)";          while(*s != 0){ @@ -70,8 +81,7 @@ printf(int fd, const char *fmt, ...)            s++;          }        } else if(c == 'c'){ -        putc(fd, *ap); -        ap++; +        putc(fd, va_arg(ap, uint));        } else if(c == '%'){          putc(fd, c);        } else { @@ -6,6 +6,7 @@  #include "x86.h"  #include "proc.h"  #include "spinlock.h" +#include "msr.h"  struct {    struct spinlock lock; @@ -16,7 +17,7 @@ static struct proc *initproc;  int nextpid = 1;  extern void forkret(void); -extern void trapret(void); +extern void sysexit(void);  static void wakeup1(void *chan); @@ -104,13 +105,13 @@ found:    // Set up new context to start executing at forkret,    // which returns to trapret. 
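
A usage note on the new msr.h above: readmsr/writemsr wrap the privileged rdmsr/wrmsr instructions, so they are only meaningful in kernel code. As an illustrative kernel-side sketch (not code from this port), the EFER register, MSR 0xc0000080, which is not listed in msr.h, should show the LME and SCE bits that init32e set with `orl $0x101, %eax`:

    #include "types.h"   /* uint32, uint64 */
    #include "msr.h"

    #define MSR_EFER 0xc0000080   /* IA32_EFER; assumed constant, not defined in msr.h above */

    /* Sketch only: LME (bit 8) = long mode enable, SCE (bit 0) = syscall/sysret enable. */
    static int
    efer_ok(void)
    {
      uint64 efer = readmsr(MSR_EFER);
      return (efer & 0x101) == 0x101;
    }
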
-  sp -= 4; -  *(uint*)sp = (uint)trapret; +  sp -= sizeof(uint64); +  *(uint64*)sp = (uint64)sysexit;    sp -= sizeof *p->context;    p->context = (struct context*)sp;    memset(p->context, 0, sizeof *p->context); -  p->context->eip = (uint)forkret; +  p->context->eip = (uint64)forkret;    return p;  } @@ -128,16 +129,12 @@ userinit(void)    initproc = p;    if((p->pgdir = setupkvm()) == 0)      panic("userinit: out of memory?"); -  inituvm(p->pgdir, _binary_initcode_start, (int)_binary_initcode_size); +  inituvm(p->pgdir, _binary_initcode_start, (uint64)_binary_initcode_size);    p->sz = PGSIZE;    memset(p->tf, 0, sizeof(*p->tf)); -  p->tf->cs = (SEG_UCODE << 3) | DPL_USER; -  p->tf->ds = (SEG_UDATA << 3) | DPL_USER; -  p->tf->es = p->tf->ds; -  p->tf->ss = p->tf->ds; -  p->tf->eflags = FL_IF; -  p->tf->esp = PGSIZE; -  p->tf->eip = 0;  // beginning of initcode.S +  p->tf->r11 = FL_IF; +  p->tf->rsp = PGSIZE; +  p->tf->rcx = 0;  // beginning of initcode.S    safestrcpy(p->name, "initcode", sizeof(p->name));    p->cwd = namei("/"); @@ -201,7 +198,7 @@ fork(void)    *np->tf = *curproc->tf;    // Clear %eax so that fork returns 0 in the child. -  np->tf->eax = 0; +  np->tf->rax = 0;    for(i = 0; i < NOFILE; i++)      if(curproc->ofile[i]) @@ -289,8 +286,8 @@ wait(void)          pid = p->pid;          kfree(p->kstack);          p->kstack = 0; -        freevm(p->pgdir); -        p->pid = 0; +        freevm(p->pgdir, p->sz); +         p->pid = 0;          p->parent = 0;          p->name[0] = 0;          p->killed = 0; @@ -339,6 +336,7 @@ scheduler(void)        // Switch to chosen process.  It is the process's job        // to release ptable.lock and then reacquire it        // before jumping back to us. +        c->proc = p;        switchuvm(p);        p->state = RUNNING; @@ -408,7 +406,7 @@ forkret(void)      iinit(ROOTDEV);      initlog(ROOTDEV);    } - +      // Return to "caller", actually trapret (see allocproc).  } @@ -514,7 +512,7 @@ procdump(void)    int i;    struct proc *p;    char *state; -  uint pc[10]; +  uint64 pc[10];    for(p = ptable.proc; p < &ptable.proc[NPROC]; p++){      if(p->state == UNUSED) @@ -525,7 +523,7 @@ procdump(void)        state = "???";      cprintf("%d %s %s", p->pid, state, p->name);      if(p->state == SLEEPING){ -      getcallerpcs((uint*)p->context->ebp+2, pc); +      getcallerpcs((uint64*)p->context->ebp+2, pc);        for(i=0; i<10 && pc[i] != 0; i++)          cprintf(" %p", pc[i]);      } @@ -1,5 +1,8 @@  // Per-CPU state  struct cpu { +  uint64 syscallno;            // Temporary used by sysentry +  uint64 usp;                  // Temporary used by sysentry +  struct proc *proc;           // The process running on this cpu or null    uchar apicid;                // Local APIC ID    struct context *scheduler;   // swtch() here to enter scheduler    struct taskstate ts;         // Used by x86 to find stack for interrupt @@ -7,7 +10,6 @@ struct cpu {    volatile uint started;       // Has the CPU started?    int ncli;                    // Depth of pushcli nesting.    int intena;                  // Were interrupts enabled before pushcli? -  struct proc *proc;           // The process running on this cpu or null  };  extern struct cpu cpus[NCPU]; @@ -25,20 +27,23 @@ extern int ncpu;  // at the "Switch stacks" comment. Switch doesn't save eip explicitly,  // but it is on the stack and allocproc() manipulates it.  
struct context { -  uint edi; -  uint esi; -  uint ebx; -  uint ebp; -  uint eip; +  uint64 r15; +  uint64 r14; +  uint64 r13; +  uint64 r12; +  uint64 r11; +  uint64 rbx; +  uint64 ebp; //rbp +  uint64 eip; //rip;  };  enum procstate { UNUSED, EMBRYO, SLEEPING, RUNNABLE, RUNNING, ZOMBIE };  // Per-process state  struct proc { -  uint sz;                     // Size of process memory (bytes) +  char *kstack;                // Bottom of kernel stack for this process, must be first entry +  uint64 sz;                   // Size of process memory (bytes)    pde_t* pgdir;                // Page table -  char *kstack;                // Bottom of kernel stack for this process    enum procstate state;        // Process state    int pid;                     // Process ID    struct proc *parent;         // Parent process @@ -69,17 +69,17 @@ release(struct spinlock *lk)  // Record the current call stack in pcs[] by following the %ebp chain.  void -getcallerpcs(void *v, uint pcs[]) +getcallerpcs(void *v, uint64 pcs[])  { -  uint *ebp; +  uint64 *ebp;    int i; -  ebp = (uint*)v - 2; +  asm volatile("mov %%rbp, %0" : "=r" (ebp));    for(i = 0; i < 10; i++){ -    if(ebp == 0 || ebp < (uint*)KERNBASE || ebp == (uint*)0xffffffff) +    if(ebp == 0 || ebp < (uint64*)KERNBASE || ebp == (uint64*)0xffffffff)        break;      pcs[i] = ebp[1];     // saved %eip -    ebp = (uint*)ebp[0]; // saved %ebp +    ebp = (uint64*)ebp[0]; // saved %ebp    }    for(; i < 10; i++)      pcs[i] = 0; @@ -5,7 +5,7 @@ struct spinlock {    // For debugging:    char *name;        // Name of lock.    struct cpu *cpu;   // The cpu holding the lock. -  uint pcs[10];      // The call stack (an array of program counters) +  uint64 pcs[10];      // The call stack (an array of program counters)                       // that locked the lock.  }; @@ -4,7 +4,7 @@  void*  memset(void *dst, int c, uint n)  { -  if ((int)dst%4 == 0 && n%4 == 0){ +  if ((uint64)dst%4 == 0 && n%4 == 0){      c &= 0xFF;      stosl(dst, (c<<24)|(c<<16)|(c<<8)|c, n/4);    } else @@ -8,22 +8,28 @@  .globl swtch  swtch: -  movl 4(%esp), %eax -  movl 8(%esp), %edx - -  # Save old callee-saved registers -  pushl %ebp -  pushl %ebx -  pushl %esi -  pushl %edi +  # Save old callee-save registers +  push %rbp +  push %rbx +  push %r11 +  push %r12 +  push %r13 +  push %r14 +  push %r15    # Switch stacks -  movl %esp, (%eax) -  movl %edx, %esp +  mov %rsp, (%rdi)   # first arg is in rdi +  mov %rsi, %rsp     # second arg is in rsi + +  # Load new callee-save registers +  pop %r15 +  pop %r14 +  pop %r13 +  pop %r12 +  pop %r11 +  pop %rbx +  pop %rbp -  # Load new callee-saved registers -  popl %edi -  popl %esi -  popl %ebx -  popl %ebp    ret + +	 @@ -15,13 +15,13 @@  // Fetch the int at addr from the current process.  int -fetchint(uint addr, int *ip) +fetchint(uint64 addr, int *ip)  {    struct proc *curproc = myproc();    if(addr >= curproc->sz || addr+4 > curproc->sz)      return -1; -  *ip = *(int*)(addr); +  *ip = *(uint64*)(addr);    return 0;  } @@ -29,7 +29,7 @@ fetchint(uint addr, int *ip)  // Doesn't actually copy the string - just sets *pp to point at it.  // Returns length of string, not including nul.  
int -fetchstr(uint addr, char **pp) +fetchstr(uint64 addr, char **pp)  {    char *s, *ep;    struct proc *curproc = myproc(); @@ -45,11 +45,51 @@ fetchstr(uint addr, char **pp)    return -1;  } +static uint64 +fetcharg(int n) +{ +  struct proc *curproc = myproc(); +  switch (n) { +  case 0: +    return curproc->tf->rdi; +  case 1: +    return curproc->tf->rsi; +  case 2: +    return curproc->tf->rdx; +  case 3: +    return curproc->tf->r10; +  case 4: +    return curproc->tf->r8; +  case 5: +    return curproc->tf->r9; +  } +  panic("fetcharg"); +  return -1; +} + +int +fetchaddr(uint64 addr, uint64 *ip) +{ +  struct proc *curproc = myproc(); +  if(addr >= curproc->sz || addr+sizeof(uint64) > curproc->sz) +    return -1; +  *ip = *(uint64*)(addr); +  return 0; +} +  // Fetch the nth 32-bit system call argument.  int  argint(int n, int *ip)  { -  return fetchint((myproc()->tf->esp) + 4 + 4*n, ip); +  *ip = fetcharg(n); +  return 0; +} + +int +argaddr(int n, uint64 *ip) +{ +  *ip = fetcharg(n); +  return 0;  }  // Fetch the nth word-sized system call argument as a pointer @@ -58,10 +98,10 @@ argint(int n, int *ip)  int  argptr(int n, char **pp, int size)  { -  int i; +  uint64 i;    struct proc *curproc = myproc(); -  if(argint(n, &i) < 0) +  if(argaddr(n, &i) < 0)      return -1;    if(size < 0 || (uint)i >= curproc->sz || (uint)i+size > curproc->sz)      return -1; @@ -134,12 +174,12 @@ syscall(void)    int num;    struct proc *curproc = myproc(); -  num = curproc->tf->eax; +  num = curproc->tf->rax;    if(num > 0 && num < NELEM(syscalls) && syscalls[num]) { -    curproc->tf->eax = syscalls[num](); +    curproc->tf->rax = syscalls[num]();    } else {      cprintf("%d %s: unknown sys call %d\n",              curproc->pid, curproc->name, num); -    curproc->tf->eax = -1; +    curproc->tf->rax = -1;    }  } @@ -399,16 +399,16 @@ sys_exec(void)  {    char *path, *argv[MAXARG];    int i; -  uint uargv, uarg; +  uint64 uargv, uarg; -  if(argstr(0, &path) < 0 || argint(1, (int*)&uargv) < 0){ +  if(argstr(0, &path) < 0 || argaddr(1, &uargv) < 0){      return -1;    }    memset(argv, 0, sizeof(argv));    for(i=0;; i++){      if(i >= NELEM(argv))        return -1; -    if(fetchint(uargv+4*i, (int*)&uarg) < 0) +    if(fetchaddr(uargv+sizeof(uint64)*i, (uint64*)&uarg) < 0)        return -1;      if(uarg == 0){        argv[i] = 0; @@ -9,8 +9,8 @@  #include "spinlock.h"  // Interrupt descriptor table (shared by all CPUs). 
-struct gatedesc idt[256]; -extern uint vectors[];  // in vectors.S: array of 256 entry pointers +struct intgate idt[256]; +extern uint64 vectors[];  // in vectors.S: array of 256 entry pointers  struct spinlock tickslock;  uint ticks; @@ -19,17 +19,22 @@ tvinit(void)  {    int i; -  for(i = 0; i < 256; i++) -    SETGATE(idt[i], 0, SEG_KCODE<<3, vectors[i], 0); -  SETGATE(idt[T_SYSCALL], 1, SEG_KCODE<<3, vectors[T_SYSCALL], DPL_USER); - +  for(i=0; i<256; i++) { +    idt[i] = INTDESC(KCSEG, vectors[i], INT_P | SEG_INTR64); +  } +  idtinit(); +        initlock(&tickslock, "time");  }  void  idtinit(void)  { -  lidt(idt, sizeof(idt)); +  struct desctr dtr; + +  dtr.limit = sizeof(idt) - 1; +  dtr.base = (uint64)idt; +  lidt((void *)&dtr.limit);  }  //PAGEBREAK: 41 @@ -74,7 +79,7 @@ trap(struct trapframe *tf)    case T_IRQ0 + 7:    case T_IRQ0 + IRQ_SPURIOUS:      cprintf("cpu%d: spurious interrupt at %x:%x\n", -            cpuid(), tf->cs, tf->eip); +            cpuid(), tf->cs, tf->rip);      lapiceoi();      break; @@ -83,14 +88,14 @@ trap(struct trapframe *tf)      if(myproc() == 0 || (tf->cs&3) == 0){        // In kernel, it must be our mistake.        cprintf("unexpected trap %d from cpu %d eip %x (cr2=0x%x)\n", -              tf->trapno, cpuid(), tf->eip, rcr2()); +              tf->trapno, cpuid(), tf->rip, rcr2());        panic("trap");      }      // In user space, assume process misbehaved.      cprintf("pid %d %s: trap %d err %d on cpu %d "              "eip 0x%x addr 0x%x--kill proc\n",              myproc()->pid, myproc()->name, tf->trapno, -            tf->err, cpuid(), tf->eip, rcr2()); +            tf->err, cpuid(), tf->rip, rcr2());      myproc()->killed = 1;    } @@ -105,8 +110,10 @@ trap(struct trapframe *tf)    if(myproc() && myproc()->state == RUNNING &&       tf->trapno == T_IRQ0+IRQ_TIMER)      yield(); - +      // Check if the process has been killed since we yielded    if(myproc() && myproc()->killed && (tf->cs&3) == DPL_USER)      exit();  } + + @@ -1,32 +1,136 @@ +#include "param.h" +#include "x86.h"	  #include "mmu.h" - -  # vectors.S sends all traps here. +	 +# vectors.S sends all traps here.  .globl alltraps  alltraps:    # Build trap frame. -  pushl %ds -  pushl %es -  pushl %fs -  pushl %gs -  pushal -   -  # Set up data segments. -  movw $(SEG_KDATA<<3), %ax -  movw %ax, %ds -  movw %ax, %es +  push %r15 +  push %r14 +  push %r13 +  push %r12 +  push %r11 +  push %r10 +  push %r9 +  push %r8 +  push %rdi +  push %rsi +  push %rbp +  push %rdx +  push %rcx +  push %rbx +  push %rax -  # Call trap(tf), where tf=%esp -  pushl %esp +  cmpw $KCSEG, 32(%rsp)   # compare to saved cs +  jz 1f  +  swapgs +   +1:mov  %rsp, %rdi  # frame in arg1    call trap -  addl $4, %esp -  # Return falls through to trapret... +# Return falls through to trapret...  .globl trapret  trapret: -  popal -  popl %gs -  popl %fs -  popl %es -  popl %ds -  addl $0x8, %esp  # trapno and errcode -  iret +  cli +  cmpw $KCSEG, 32(%rsp)  # compare to saved cs +  jz 1f +  swapgs + +1:pop %rax +  pop %rbx +  pop %rcx +  pop %rdx +  pop %rbp +  pop %rsi +  pop %rdi +  pop %r8 +  pop %r9 +  pop %r10 +  pop %r11 +  pop %r12 +  pop %r13 +  pop %r14 +  pop %r15 + +  add $16, %rsp  # discard trapnum and errorcode +  iretq +#PAGEBREAK! + +# syscall_entry jumps here after syscall instruction +.globl sysentry +sysentry:  # Build trap frame. 
+  // load kernel stack address +  swapgs +  movq  %rax, %gs:0  // save %rax in syscallno of cpu entry +  movq  %rsp, %gs:8  // user sp +  movq  %gs:16, %rax  // proc entry +   +  movq  %ss:0(%rax), %rax // load kstack from proc +  addq  $(KSTACKSIZE), %rax + +  movq  %rax, %rsp +  movq  %gs:0, %rax  // restore rax + +  // push usp +  push $0 +  push %gs:8 +  // safe eflags and eip +  push %r11 +  push $UCSEG +  push %rcx +  // push errno and trapno to make stack look like a trap +  push $0 +  push $64 + +  // push values on kernel stack +  push %r15 +  push %r14 +  push %r13 +  push %r12 +  push %r11 +  push %r10 +  push %r9 +  push %r8 +  push %rdi +  push %rsi +  push %rbp +  push %rdx +  push %rcx +  push %rbx +  push %rax + +  mov  %rsp, %rdi  # frame in arg1 + +  call trap +#PAGEBREAK! + +# Return falls through to trapret... +.globl sysexit +sysexit: +  # to make sure we don't get any interrupts on the user stack while in +  # supervisor mode.  insufficient?  (see vunerability reports for sysret) +  cli +   +  pop %rax +  pop %rbx +  pop %rcx +  pop %rdx +  pop %rbp +  pop %rsi +  pop %rdi +  pop %r8 +  pop %r9 +  pop %r10 +  pop %r11 +  pop %r12 +  pop %r13 +  pop %r14 +  pop %r15 + +  add $(5*8), %rsp  # discard trapnum, errorcode, rip, cs and rflags +  mov (%rsp),%rsp  # switch to the user stack +  swapgs	 + +  sysretq + @@ -36,3 +36,4 @@  #define IRQ_ERROR       19  #define IRQ_SPURIOUS    31 + @@ -1,4 +1,10 @@  typedef unsigned int   uint;  typedef unsigned short ushort;  typedef unsigned char  uchar; -typedef uint pde_t; + +typedef unsigned char uint8; +typedef unsigned short uint16; +typedef unsigned int  uint32; +typedef unsigned long uint64; + +typedef uint64 pde_t; diff --git a/usertests.c b/usertests.c index a1e97e7..07d10d4 100644 --- a/usertests.c +++ b/usertests.c @@ -363,17 +363,29 @@ preempt(void)    printf(1, "preempt: ");    pid1 = fork(); +  if(pid1 < 0) { +    printf(1, "fork failed"); +    exit(); +  }    if(pid1 == 0)      for(;;)        ;    pid2 = fork(); +  if(pid2 < 0) { +    printf(1, "fork failed\n"); +    exit(); +  }    if(pid2 == 0)      for(;;)        ;    pipe(pfds);    pid3 = fork(); +  if(pid3 < 0) { +     printf(1, "fork failed\n"); +     exit(); +  }    if(pid3 == 0){      close(pfds[0]);      if(write(pfds[1], "x", 1) != 1) @@ -1391,6 +1403,11 @@ forktest(void)        exit();    } +  if (n == 0) { +    printf(1, "no fork at all!\n"); +    exit(); +  } +    if(n == 1000){      printf(1, "fork claimed to work 1000 times!\n");      exit(); @@ -1414,16 +1431,16 @@ forktest(void)  void  sbrktest(void)  { -  int fds[2], pid, pids[10], ppid; -  char *a, *b, *c, *lastaddr, *oldbrk, *p, scratch; -  uint amt; +  int i, fds[2], pids[10], pid, ppid; +  char *c, *oldbrk, scratch, *a, *b, *lastaddr, *p; +  uint64 amt; +  #define BIG (100*1024*1024)    printf(stdout, "sbrk test\n");    oldbrk = sbrk(0);    // can one sbrk() less than a page?    a = sbrk(0); -  int i;    for(i = 0; i < 5000; i++){      b = sbrk(1);      if(b != a){ @@ -1449,9 +1466,8 @@ sbrktest(void)    wait();    // can one grow address space to something big? -#define BIG (100*1024*1024)    a = sbrk(0); -  amt = (BIG) - (uint)a; +  amt = (BIG) - (uint64)a;    p = sbrk(amt);    if (p != a) {      printf(stdout, "sbrk test failed to grow big address space; enough phys mem?\n"); @@ -1508,7 +1524,7 @@ sbrktest(void)      }      wait();    } - +        // if we run the system out of memory, does it clean up the last    // failed allocation?    
if(pipe(fds) != 0){ @@ -1518,7 +1534,7 @@ sbrktest(void)    for(i = 0; i < sizeof(pids)/sizeof(pids[0]); i++){      if((pids[i] = fork()) == 0){        // allocate a lot of memory -      sbrk(BIG - (uint)sbrk(0)); +      sbrk(BIG - (uint64)sbrk(0));        write(fds[1], "x", 1);        // sit around until killed        for(;;) sleep(1000); @@ -1526,6 +1542,7 @@ sbrktest(void)      if(pids[i] != -1)        read(fds[0], &scratch, 1);    } +    // if those failed allocations freed up the pages they did allocate,    // we'll be able to allocate here    c = sbrk(4096); @@ -1549,7 +1566,7 @@ sbrktest(void)  void  validateint(int *p)  { -  int res; +  /* XXX int res;    asm("mov %%esp, %%ebx\n\t"        "mov %3, %%esp\n\t"        "int %2\n\t" @@ -1557,13 +1574,14 @@ validateint(int *p)        "=a" (res) :        "a" (SYS_sleep), "n" (T_SYSCALL), "c" (p) :        "ebx"); +  */  }  void  validatetest(void)  {    int hi, pid; -  uint p; +  uint64 p;    printf(stdout, "validate test\n");    hi = 1100*1024; @@ -5,7 +5,7 @@    .globl name; \    name: \      movl $SYS_ ## name, %eax; \ -    int $T_SYSCALL; \ +    syscall; \      ret  SYSCALL(fork) @@ -12,9 +12,9 @@ for(my $i = 0; $i < 256; $i++){      print ".globl vector$i\n";      print "vector$i:\n";      if(!($i == 8 || ($i >= 10 && $i <= 14) || $i == 17)){ -        print "  pushl \$0\n"; +        print "  push \$0\n";      } -    print "  pushl \$$i\n"; +    print "  push \$$i\n";      print "  jmp alltraps\n";  } @@ -23,7 +23,7 @@ print ".data\n";  print ".globl vectors\n";  print "vectors:\n";  for(my $i = 0; $i < 256; $i++){ -    print "  .long vector$i\n"; +    print "  .quad vector$i\n";  }  # sample output: @@ -31,8 +31,8 @@ for(my $i = 0; $i < 256; $i++){  #   .globl alltraps  #   .globl vector0  #   vector0: -#     pushl $0 -#     pushl $0 +#     push $0 +#     push $0  #     jmp alltraps  #   ...  #    @@ -40,8 +40,8 @@ for(my $i = 0; $i < 256; $i++){  #   .data  #   .globl vectors  #   vectors: -#     .long vector0 -#     .long vector1 -#     .long vector2 +#     .quad vector0 +#     .quad vector1 +#     .quad vector2  #   ... @@ -2,13 +2,34 @@  #include "types.h"  #include "defs.h"  #include "x86.h" +#include "msr.h"  #include "memlayout.h"  #include "mmu.h"  #include "proc.h"  #include "elf.h" +#include "traps.h"  extern char data[];  // defined by kernel.ld -pde_t *kpgdir;  // for use in scheduler() +void sysentry(void); + +static pde_t *kpml4; // kernel address space, used by scheduler and bootup + +// Bootstrap GDT.  Used by boot.S but defined in C +// Map "logical" addresses to virtual addresses using identity map. +// Cannot share a CODE descriptor for both kernel and user +// because it would have to have DPL_USR, but the CPU forbids +// an interrupt from CPL=0 to DPL=3. +struct segdesc bootgdt[NSEGS] = { +  [0] = SEGDESC(0, 0, 0),  // null +  [1] = SEGDESC(0, 0xfffff, SEG_R|SEG_CODE|SEG_S|SEG_DPL(0)|SEG_P|SEG_D|SEG_G),  // 32-bit kernel code +  [2] = SEGDESC(0, 0, SEG_R|SEG_CODE|SEG_S|SEG_DPL(0)|SEG_P|SEG_L|SEG_G),  // 64-bit kernel code +  [3] = SEGDESC(0, 0xfffff, SEG_W|SEG_S|SEG_DPL(0)|SEG_P|SEG_D|SEG_G),       // kernel data +  // The order of the user data and user code segments is +  // important for syscall instructions.  See initseg. +  [6] = SEGDESC(0, 0xfffff, SEG_W|SEG_S|SEG_DPL(3)|SEG_P|SEG_D|SEG_G),   // 64-bit user data +  [7] = SEGDESC(0, 0, SEG_R|SEG_CODE|SEG_S|SEG_DPL(3)|SEG_P|SEG_L|SEG_G),    // 64-bit user code +}; +  // Set up CPU's kernel segment descriptors.  // Run once on entry on each CPU. 
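The seginit() hunk below programs MSR_STAR so that syscall and sysret load exactly the selectors laid out in bootgdt above. As an illustration of that arithmetic (this snippet is not part of the patch; it assumes KCSEG = 0x10 and UCSEG = 0x38, i.e. bootgdt slots 2 and 7 shifted left by 3 — the real constants live in mmu.h, which this hunk does not show):

    /* Sketch: the selector arithmetic behind the MSR_STAR value written in seginit().
     * KCSEG/UCSEG are assumed from the bootgdt slot numbers above, not taken from mmu.h. */
    #include <stdio.h>
    int main(void)
    {
      unsigned long long kcseg = 2 << 3;   /* 0x10: 64-bit kernel code, slot 2 */
      unsigned long long ucseg = 7 << 3;   /* 0x38: 64-bit user code, slot 7 */
      unsigned long long star = (((ucseg | 0x3) - 16) << 48) | (kcseg << 32);
      /* syscall loads CS from STAR[47:32] and SS from STAR[47:32] + 8 */
      printf("syscall: cs=%#llx ss=%#llx\n",
             (star >> 32) & 0xffff, ((star >> 32) & 0xffff) + 8);         /* 0x10, 0x18 */
      /* 64-bit sysret loads CS from STAR[63:48] + 16 and SS from STAR[63:48] + 8 */
      printf("sysret:  cs=%#llx ss=%#llx\n",
             ((star >> 48) & 0xffff) + 16, ((star >> 48) & 0xffff) + 8);  /* 0x3b, 0x33 */
      return 0;
    }

The +8/+16 rule is why bootgdt must keep 64-bit user data in slot 6 and 64-bit user code in slot 7: sysret then returns with SS = 0x33 (user data, RPL 3) and CS = 0x3b (user code, RPL 3).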
@@ -16,41 +37,82 @@ void  seginit(void)  {    struct cpu *c; - -  // Map "logical" addresses to virtual addresses using identity map. -  // Cannot share a CODE descriptor for both kernel and user -  // because it would have to have DPL_USR, but the CPU forbids -  // an interrupt from CPL=0 to DPL=3. -  c = &cpus[cpuid()]; -  c->gdt[SEG_KCODE] = SEG(STA_X|STA_R, 0, 0xffffffff, 0); -  c->gdt[SEG_KDATA] = SEG(STA_W, 0, 0xffffffff, 0); -  c->gdt[SEG_UCODE] = SEG(STA_X|STA_R, 0, 0xffffffff, DPL_USER); -  c->gdt[SEG_UDATA] = SEG(STA_W, 0, 0xffffffff, DPL_USER); -  lgdt(c->gdt, sizeof(c->gdt)); +  struct desctr dtr; + +  c = mycpu(); +  memmove(c->gdt, bootgdt, sizeof bootgdt); +  dtr.limit = sizeof(c->gdt)-1; +  dtr.base = (uint64) c->gdt; +  lgdt((void *)&dtr.limit); + +  // When executing a syscall instruction the CPU sets the SS selector +  // to (star >> 32) + 8 and the CS selector to (star >> 32). +  // When executing a sysret instruction the CPU sets the SS selector +  // to (star >> 48) + 8 and the CS selector to (star >> 48) + 16. +  uint64 star = ((((uint64)UCSEG|0x3)- 16)<<48)|((uint64)(KCSEG)<<32); +  writemsr(MSR_STAR, star); +  writemsr(MSR_LSTAR, (uint64)&sysentry); +  writemsr(MSR_SFMASK, FL_TF | FL_IF); + +    // Initialize cpu-local storage. +  writegs(KDSEG); +  writemsr(MSR_GS_BASE, (uint64)c); +  writemsr(MSR_GS_KERNBASE, (uint64)c);  }  // Return the address of the PTE in page table pgdir  // that corresponds to virtual address va.  If alloc!=0,  // create any required page table pages.  static pte_t * -walkpgdir(pde_t *pgdir, const void *va, int alloc) +walkpgdir(pde_t *pml4, const void *va, int alloc)  { +  pml4e_t *pml4e; +  pdpe_t *pdp; +  pdpe_t *pdpe;    pde_t *pde; +  pde_t *pd;    pte_t *pgtab; -  pde = &pgdir[PDX(va)]; -  if(*pde & PTE_P){ -    pgtab = (pte_t*)P2V(PTE_ADDR(*pde)); -  } else { -    if(!alloc || (pgtab = (pte_t*)kalloc()) == 0) +  // level 4 +  pml4e = &pml4[PMX(va)]; +  if(*pml4e & PTE_P) +    pdp = (pdpe_t*)P2V(PTE_ADDR(*pml4e));   +  else { +    if(!alloc || (pdp = (pdpe_t*)kalloc()) == 0)        return 0;      // Make sure all those PTE_P bits are zero. -    memset(pgtab, 0, PGSIZE); +    memset(pdp, 0, PGSIZE);      // The permissions here are overly generous, but they can      // be further restricted by the permissions in the page table      // entries, if necessary. +    *pml4e = V2P(pdp) | PTE_P | PTE_W | PTE_U; +  } + +  // XXX avoid repetition + +  // level 3 +  pdpe = &pdp[PDPX(va)];   +  if(*pdpe & PTE_P)  +    pd = (pde_t*)P2V(PTE_ADDR(*pdpe)); +  else { +    if(!alloc || (pd = (pde_t*)kalloc()) == 0) +      return 0; +    memset(pd, 0, PGSIZE); +    *pdpe = V2P(pd) | PTE_P | PTE_W | PTE_U; +  } + +  // level 2 +  pde = &pd[PDX(va)];  +  if(*pde & PTE_P) +    pgtab = (pte_t*)P2V(PTE_ADDR(*pde)); +  else { +    if(!alloc || (pgtab = (pte_t*)kalloc()) == 0) +      return 0; +    memset(pgtab, 0, PGSIZE);      *pde = V2P(pgtab) | PTE_P | PTE_W | PTE_U;    } + +  // level 1    return &pgtab[PTX(va)];  } @@ -58,13 +120,13 @@ walkpgdir(pde_t *pgdir, const void *va, int alloc)  // physical addresses starting at pa. va and size might not  // be page-aligned.  
static int -mappages(pde_t *pgdir, void *va, uint size, uint pa, int perm) +mappages(pde_t *pgdir, void *va, uint64 size, uint64 pa, int perm)  {    char *a, *last;    pte_t *pte; -  a = (char*)PGROUNDDOWN((uint)va); -  last = (char*)PGROUNDDOWN(((uint)va) + size - 1); +  a = (char*)PGROUNDDOWN((uint64)va); +  last = (char*)PGROUNDDOWN(((uint64)va) + size - 1);    for(;;){      if((pte = walkpgdir(pgdir, a, 1)) == 0)        return -1; @@ -80,7 +142,7 @@ mappages(pde_t *pgdir, void *va, uint size, uint pa, int perm)  }  // There is one page table per process, plus one that's used when -// a CPU is not running any process (kpgdir). The kernel uses the +// a CPU is not running any process (kpml4). The kernel uses the  // current process's page table during system calls and interrupts;  // page protection bits prevent user code from using the kernel's  // mappings. @@ -104,35 +166,36 @@ mappages(pde_t *pgdir, void *va, uint size, uint pa, int perm)  // every process's page table.  static struct kmap {    void *virt; -  uint phys_start; -  uint phys_end; +  uint64 phys_start; +  uint64 phys_end;    int perm;  } kmap[] = {   { (void*)KERNBASE, 0,             EXTMEM,    PTE_W}, // I/O space   { (void*)KERNLINK, V2P(KERNLINK), V2P(data), 0},     // kern text+rodata   { (void*)data,     V2P(data),     PHYSTOP,   PTE_W}, // kern data+memory - { (void*)DEVSPACE, DEVSPACE,      0,         PTE_W}, // more devices + { (void*)P2V(DEVSPACE), DEVSPACE, DEVSPACETOP, PTE_W}, // more devices  };  // Set up kernel part of a page table.  pde_t*  setupkvm(void)  { -  pde_t *pgdir; +  pde_t *pml4;    struct kmap *k; -  if((pgdir = (pde_t*)kalloc()) == 0) +  if((pml4 = (pde_t*)kalloc()) == 0)      return 0; -  memset(pgdir, 0, PGSIZE); -  if (P2V(PHYSTOP) > (void*)DEVSPACE) +  memset(pml4, 0, PGSIZE); +  if (PHYSTOP > DEVSPACE)      panic("PHYSTOP too high"); -  for(k = kmap; k < &kmap[NELEM(kmap)]; k++) -    if(mappages(pgdir, k->virt, k->phys_end - k->phys_start, +  for(k = kmap; k < &kmap[NELEM(kmap)]; k++) { +    if(mappages(pml4, k->virt, k->phys_end - k->phys_start,                  (uint)k->phys_start, k->perm) < 0) { -      freevm(pgdir); +      freevm(pml4, 0);        return 0;      } -  return pgdir; +  } +  return pml4;  }  // Allocate one page table for the machine for the kernel address @@ -140,7 +203,7 @@ setupkvm(void)  void  kvmalloc(void)  { -  kpgdir = setupkvm(); +  kpml4 = setupkvm();    switchkvm();  } @@ -149,13 +212,17 @@ kvmalloc(void)  void  switchkvm(void)  { -  lcr3(V2P(kpgdir));   // switch to the kernel page table +  lcr3(V2P(kpml4));   // switch to the kernel page table  } +  // Switch TSS and h/w page table to correspond to process p.  
void  switchuvm(struct proc *p)  { +  struct desctr dtr; +  struct cpu *c; +      if(p == 0)      panic("switchuvm: no process");    if(p->kstack == 0) @@ -164,16 +231,22 @@ switchuvm(struct proc *p)      panic("switchuvm: no pgdir");    pushcli(); -  mycpu()->gdt[SEG_TSS] = SEG16(STS_T32A, &mycpu()->ts, -                                sizeof(mycpu()->ts)-1, 0); -  mycpu()->gdt[SEG_TSS].s = 0; -  mycpu()->ts.ss0 = SEG_KDATA << 3; -  mycpu()->ts.esp0 = (uint)p->kstack + KSTACKSIZE; -  // setting IOPL=0 in eflags *and* iomb beyond the tss segment limit -  // forbids I/O instructions (e.g., inb and outb) from user space -  mycpu()->ts.iomb = (ushort) 0xFFFF; -  ltr(SEG_TSS << 3); + +  c = mycpu(); +  uint64 base = (uint64) &(c->ts); +  c->gdt[TSSSEG>>3] =  SEGDESC(base, (sizeof(c->ts)-1), SEG_P|SEG_TSS64A); +  c->gdt[(TSSSEG>>3)+1] = SEGDESCHI(base); +  c->ts.rsp[0] = (uint64) p->kstack + KSTACKSIZE; +  c->ts.iomba = (ushort) 0xFFFF; + +  dtr.limit = sizeof(c->gdt) - 1; +  dtr.base = (uint64)c->gdt; +  lgdt((void *)&dtr.limit); + +  ltr(TSSSEG); +    lcr3(V2P(p->pgdir));  // switch to process's address space +    popcli();  } @@ -197,10 +270,11 @@ inituvm(pde_t *pgdir, char *init, uint sz)  int  loaduvm(pde_t *pgdir, char *addr, struct inode *ip, uint offset, uint sz)  { -  uint i, pa, n; +  uint i, n; +  uint64 pa;    pte_t *pte; -  if((uint) addr % PGSIZE != 0) +  if((uint64) addr % PGSIZE != 0)      panic("loaduvm: addr must be page aligned");    for(i = 0; i < sz; i += PGSIZE){      if((pte = walkpgdir(pgdir, addr+i, 0)) == 0) @@ -222,7 +296,7 @@ int  allocuvm(pde_t *pgdir, uint oldsz, uint newsz)  {    char *mem; -  uint a; +  uint64 a;    if(newsz >= KERNBASE)      return 0; @@ -233,13 +307,11 @@ allocuvm(pde_t *pgdir, uint oldsz, uint newsz)    for(; a < newsz; a += PGSIZE){      mem = kalloc();      if(mem == 0){ -      cprintf("allocuvm out of memory\n");        deallocuvm(pgdir, newsz, oldsz);        return 0;      }      memset(mem, 0, PGSIZE);      if(mappages(pgdir, (char*)a, PGSIZE, V2P(mem), PTE_W|PTE_U) < 0){ -      cprintf("allocuvm out of memory (2)\n");        deallocuvm(pgdir, newsz, oldsz);        kfree(mem);        return 0; @@ -253,10 +325,10 @@ allocuvm(pde_t *pgdir, uint oldsz, uint newsz)  // need to be less than oldsz.  oldsz can be larger than the actual  // process size.  Returns the new process size.  int -deallocuvm(pde_t *pgdir, uint oldsz, uint newsz) +deallocuvm(pde_t *pgdir, uint64 oldsz, uint64 newsz)  {    pte_t *pte; -  uint a, pa; +  uint64 a, pa;    if(newsz >= oldsz)      return oldsz; @@ -281,20 +353,34 @@ deallocuvm(pde_t *pgdir, uint oldsz, uint newsz)  // Free a page table and all the physical memory pages  // in the user part.  
void -freevm(pde_t *pgdir) +freevm(pde_t *pml4, uint64 sz)  { -  uint i; +  uint i, j, k; +  pde_t *pdp, *pd, *pt; -  if(pgdir == 0) +  if(pml4 == 0)      panic("freevm: no pgdir"); -  deallocuvm(pgdir, KERNBASE, 0); + +  deallocuvm(pml4, sz, 0);    for(i = 0; i < NPDENTRIES; i++){ -    if(pgdir[i] & PTE_P){ -      char * v = P2V(PTE_ADDR(pgdir[i])); -      kfree(v); +    if(pml4[i] & PTE_P){ +      pdp = (pdpe_t*)P2V(PTE_ADDR(pml4[i])); +      for(j = 0; j < NPDENTRIES; j++){ +        if(pdp[j] & PTE_P){ +          pd = (pde_t*)P2V(PTE_ADDR(pdp[j])); +          for(k = 0; k < NPDENTRIES; k++){ +            if(pd[k] & PTE_P) { +              pt = (pde_t*)P2V(PTE_ADDR(pd[k])); +              kfree((char*)pt); +            } +          } +          kfree((char*)pd); +        } +      } +      kfree((char*)pdp);      }    } -  kfree((char*)pgdir); +  kfree((char*)pml4);  }  // Clear PTE_U on a page. Used to create an inaccessible @@ -317,7 +403,8 @@ copyuvm(pde_t *pgdir, uint sz)  {    pde_t *d;    pte_t *pte; -  uint pa, i, flags; +  uint64 pa, i; +  uint flags;    char *mem;    if((d = setupkvm()) == 0) @@ -340,7 +427,7 @@ copyuvm(pde_t *pgdir, uint sz)    return d;  bad: -  freevm(d); +  freevm(d, sz);    return 0;  } @@ -366,7 +453,7 @@ int  copyout(pde_t *pgdir, uint va, void *p, uint len)  {    char *buf, *pa0; -  uint n, va0; +  uint64 n, va0;    buf = (char*)p;    while(len > 0){ @@ -1,5 +1,7 @@  // Routines to let C code use special x86 instructions. +#ifndef __ASSEMBLER__ +  static inline uchar  inb(ushort port)  { @@ -57,32 +59,16 @@ stosl(void *addr, int data, int cnt)                 "memory", "cc");  } -struct segdesc; -  static inline void -lgdt(struct segdesc *p, int size) +lgdt(void *p)  { -  volatile ushort pd[3]; - -  pd[0] = size-1; -  pd[1] = (uint)p; -  pd[2] = (uint)p >> 16; - -  asm volatile("lgdt (%0)" : : "r" (pd)); +  asm volatile("lgdt (%0)" : : "r" (p) : "memory");  } -struct gatedesc; -  static inline void -lidt(struct gatedesc *p, int size) +lidt(void *p)  { -  volatile ushort pd[3]; - -  pd[0] = size-1; -  pd[1] = (uint)p; -  pd[2] = (uint)p >> 16; - -  asm volatile("lidt (%0)" : : "r" (pd)); +  asm volatile("lidt (%0)" : : "r" (p) : "memory");  }  static inline void @@ -91,11 +77,11 @@ ltr(ushort sel)    asm volatile("ltr %0" : : "r" (sel));  } -static inline uint +static inline uint64  readeflags(void)  { -  uint eflags; -  asm volatile("pushfl; popl %0" : "=r" (eflags)); +  uint64 eflags; +  asm volatile("pushf; pop %0" : "=r" (eflags));    return eflags;  } @@ -133,51 +119,53 @@ xchg(volatile uint *addr, uint newval)  static inline uint  rcr2(void)  { -  uint val; -  asm volatile("movl %%cr2,%0" : "=r" (val)); +  uint64 val; +  asm volatile("mov %%cr2,%0" : "=r" (val));    return val;  }  static inline void -lcr3(uint val) +lcr3(uint64 val) +{ +  asm volatile("mov %0,%%cr3" : : "r" (val)); +} + +static inline void +writegs(uint16 v)  { -  asm volatile("movl %0,%%cr3" : : "r" (val)); +  __asm volatile("movw %0, %%gs" : : "r" (v));  } +  //PAGEBREAK: 36  // Layout of the trap frame built on the stack by the  // hardware and by trapasm.S, and passed to trap().  
struct trapframe { -  // registers as pushed by pusha -  uint edi; -  uint esi; -  uint ebp; -  uint oesp;      // useless & ignored -  uint ebx; -  uint edx; -  uint ecx; -  uint eax; - -  // rest of trap frame -  ushort gs; -  ushort padding1; -  ushort fs; -  ushort padding2; -  ushort es; -  ushort padding3; -  ushort ds; -  ushort padding4; -  uint trapno; - -  // below here defined by x86 hardware -  uint err; -  uint eip; -  ushort cs; -  ushort padding5; -  uint eflags; - -  // below here only when crossing rings, such as from user to kernel -  uint esp; -  ushort ss; -  ushort padding6; -}; +   uint64 rax;       +   uint64 rbx; +   uint64 rcx; +   uint64 rdx; +   uint64 rbp; +   uint64 rsi; +   uint64 rdi; +   uint64 r8; +   uint64 r9; +   uint64 r10; +   uint64 r11; +   uint64 r12; +   uint64 r13; +   uint64 r14; +   uint64 r15; +   uint64 trapno; +   uint64 err; +   uint64 rip;      +   uint16 cs; +   uint16 padding[3]; +   uint64 rflags;   +   uint64 rsp;      +   uint64 ss;       +}__attribute__((packed)); + +#endif + +#define TF_CS 144 // offset in trapframe for saved cs
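Both TF_CS above and the %gs-relative loads in sysentry hard-code structure offsets: struct cpu's first three fields at %gs:0/8/16, struct proc's kstack at offset 0, and the saved cs at byte 144 of struct trapframe. A compile-time check is one way to pin those assumptions down. The sketch below is not part of the patch; it assumes a C11 compiler and that the usual kernel headers ("types.h", "param.h", "mmu.h", "x86.h", "proc.h") have been included so the structs and TF_CS are in scope:

    /* Sketch: assert the layout that trapasm.S relies on. */
    #include <stddef.h>   /* offsetof; the kernel could define it locally instead */

    _Static_assert(offsetof(struct cpu, syscallno) == 0,  "sysentry saves %rax at %gs:0");
    _Static_assert(offsetof(struct cpu, usp)       == 8,  "sysentry saves the user %rsp at %gs:8");
    _Static_assert(offsetof(struct cpu, proc)      == 16, "sysentry loads the proc pointer from %gs:16");
    _Static_assert(offsetof(struct proc, kstack)   == 0,  "sysentry reads kstack through 0(%rax)");
    _Static_assert(offsetof(struct trapframe, cs)  == TF_CS, "TF_CS must track struct trapframe");

If the structs are ever reordered, the build then fails instead of the syscall path silently corrupting state.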
