Diffstat (limited to 'kernel')
46 files changed, 6191 insertions, 0 deletions
| diff --git a/kernel/bio.c b/kernel/bio.c new file mode 100644 index 0000000..a1074f2 --- /dev/null +++ b/kernel/bio.c @@ -0,0 +1,151 @@ +// Buffer cache. +// +// The buffer cache is a linked list of buf structures holding +// cached copies of disk block contents.  Caching disk blocks +// in memory reduces the number of disk reads and also provides +// a synchronization point for disk blocks used by multiple processes. +// +// Interface: +// * To get a buffer for a particular disk block, call bread. +// * After changing buffer data, call bwrite to write it to disk. +// * When done with the buffer, call brelse. +// * Do not use the buffer after calling brelse. +// * Only one process at a time can use a buffer, +//     so do not keep them longer than necessary. + + +#include "types.h" +#include "param.h" +#include "spinlock.h" +#include "sleeplock.h" +#include "riscv.h" +#include "defs.h" +#include "fs.h" +#include "buf.h" + +struct { +  struct spinlock lock; +  struct buf buf[NBUF]; + +  // Linked list of all buffers, through prev/next. +  // head.next is most recently used. +  struct buf head; +} bcache; + +void +binit(void) +{ +  struct buf *b; + +  initlock(&bcache.lock, "bcache"); + +  // Create linked list of buffers +  bcache.head.prev = &bcache.head; +  bcache.head.next = &bcache.head; +  for(b = bcache.buf; b < bcache.buf+NBUF; b++){ +    b->next = bcache.head.next; +    b->prev = &bcache.head; +    initsleeplock(&b->lock, "buffer"); +    bcache.head.next->prev = b; +    bcache.head.next = b; +  } +} + +// Look through buffer cache for block on device dev. +// If not found, allocate a buffer. +// In either case, return locked buffer. +static struct buf* +bget(uint dev, uint blockno) +{ +  struct buf *b; + +  acquire(&bcache.lock); + +  // Is the block already cached? +  for(b = bcache.head.next; b != &bcache.head; b = b->next){ +    if(b->dev == dev && b->blockno == blockno){ +      b->refcnt++; +      release(&bcache.lock); +      acquiresleep(&b->lock); +      return b; +    } +  } + +  // Not cached; recycle an unused buffer. +  for(b = bcache.head.prev; b != &bcache.head; b = b->prev){ +    if(b->refcnt == 0) { +      b->dev = dev; +      b->blockno = blockno; +      b->valid = 0; +      b->refcnt = 1; +      release(&bcache.lock); +      acquiresleep(&b->lock); +      return b; +    } +  } +  panic("bget: no buffers"); +} + +// Return a locked buf with the contents of the indicated block. +struct buf* +bread(uint dev, uint blockno) +{ +  struct buf *b; + +  b = bget(dev, blockno); +  if(!b->valid) { +    virtio_disk_rw(b, 0); +    b->valid = 1; +  } +  return b; +} + +// Write b's contents to disk.  Must be locked. +void +bwrite(struct buf *b) +{ +  if(!holdingsleep(&b->lock)) +    panic("bwrite"); +  virtio_disk_rw(b, 1); +} + +// Release a locked buffer. +// Move to the head of the MRU list. +void +brelse(struct buf *b) +{ +  if(!holdingsleep(&b->lock)) +    panic("brelse"); + +  releasesleep(&b->lock); + +  acquire(&bcache.lock); +  b->refcnt--; +  if (b->refcnt == 0) { +    // no one is waiting for it. 
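+    // Splice b out of the list and reinsert it just after the head:
+    // bget() scans forward from head.next for cached blocks, and
+    // backward from head.prev when recycling, so this keeps the list
+    // ordered from most to least recently used.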
+    b->next->prev = b->prev; +    b->prev->next = b->next; +    b->next = bcache.head.next; +    b->prev = &bcache.head; +    bcache.head.next->prev = b; +    bcache.head.next = b; +  } +   +  release(&bcache.lock); +} + +void +bpin(struct buf *b) { +  acquire(&bcache.lock); +  b->refcnt++; +  release(&bcache.lock); +} + +void +bunpin(struct buf *b) { +  acquire(&bcache.lock); +  b->refcnt--; +  release(&bcache.lock); +} + + diff --git a/kernel/buf.h b/kernel/buf.h new file mode 100644 index 0000000..4a3a39d --- /dev/null +++ b/kernel/buf.h @@ -0,0 +1,13 @@ +struct buf { +  int valid;   // has data been read from disk? +  int disk;    // does disk "own" buf? +  uint dev; +  uint blockno; +  struct sleeplock lock; +  uint refcnt; +  struct buf *prev; // LRU cache list +  struct buf *next; +  struct buf *qnext; // disk queue +  uchar data[BSIZE]; +}; + diff --git a/kernel/console.c b/kernel/console.c new file mode 100644 index 0000000..87a83ff --- /dev/null +++ b/kernel/console.c @@ -0,0 +1,199 @@ +// +// Console input and output, to the uart. +// Reads are line at a time. +// Implements special input characters: +//   newline -- end of line +//   control-h -- backspace +//   control-u -- kill line +//   control-d -- end of file +//   control-p -- print process list +// + +#include <stdarg.h> + +#include "types.h" +#include "param.h" +#include "spinlock.h" +#include "sleeplock.h" +#include "fs.h" +#include "file.h" +#include "memlayout.h" +#include "riscv.h" +#include "defs.h" +#include "proc.h" + +#define BACKSPACE 0x100 +#define C(x)  ((x)-'@')  // Control-x + +// +// send one character to the uart. +// +void +consputc(int c) +{ +  extern volatile int panicked; // from printf.c + +  if(panicked){ +    for(;;) +      ; +  } + +  if(c == BACKSPACE){ +    // if the user typed backspace, overwrite with a space. +    uartputc('\b'); uartputc(' '); uartputc('\b'); +  } else { +    uartputc(c); +  } +} + +struct { +  struct spinlock lock; +   +  // input +#define INPUT_BUF 128 +  char buf[INPUT_BUF]; +  uint r;  // Read index +  uint w;  // Write index +  uint e;  // Edit index +} cons; + +// +// user write()s to the console go here. +// +int +consolewrite(int user_src, uint64 src, int n) +{ +  int i; + +  acquire(&cons.lock); +  for(i = 0; i < n; i++){ +    char c; +    if(either_copyin(&c, user_src, src+i, 1) == -1) +      break; +    consputc(c); +  } +  release(&cons.lock); + +  return n; +} + +// +// user read()s from the console go here. +// copy (up to) a whole input line to dst. +// user_dist indicates whether dst is a user +// or kernel address. +// +int +consoleread(int user_dst, uint64 dst, int n) +{ +  uint target; +  int c; +  char cbuf; + +  target = n; +  acquire(&cons.lock); +  while(n > 0){ +    // wait until interrupt handler has put some +    // input into cons.buffer. +    while(cons.r == cons.w){ +      if(myproc()->killed){ +        release(&cons.lock); +        return -1; +      } +      sleep(&cons.r, &cons.lock); +    } + +    c = cons.buf[cons.r++ % INPUT_BUF]; + +    if(c == C('D')){  // end-of-file +      if(n < target){ +        // Save ^D for next time, to make sure +        // caller gets a 0-byte result. +        cons.r--; +      } +      break; +    } + +    // copy the input byte to the user-space buffer. +    cbuf = c; +    if(either_copyout(user_dst, dst, &cbuf, 1) == -1) +      break; + +    dst++; +    --n; + +    if(c == '\n'){ +      // a whole line has arrived, return to +      // the user-level read(). 
+      break; +    } +  } +  release(&cons.lock); + +  return target - n; +} + +// +// the console input interrupt handler. +// uartintr() calls this for input character. +// do erase/kill processing, append to cons.buf, +// wake up consoleread() if a whole line has arrived. +// +void +consoleintr(int c) +{ +  acquire(&cons.lock); + +  switch(c){ +  case C('P'):  // Print process list. +    procdump(); +    break; +  case C('U'):  // Kill line. +    while(cons.e != cons.w && +          cons.buf[(cons.e-1) % INPUT_BUF] != '\n'){ +      cons.e--; +      consputc(BACKSPACE); +    } +    break; +  case C('H'): // Backspace +  case '\x7f': +    if(cons.e != cons.w){ +      cons.e--; +      consputc(BACKSPACE); +    } +    break; +  default: +    if(c != 0 && cons.e-cons.r < INPUT_BUF){ +      c = (c == '\r') ? '\n' : c; + +      // echo back to the user. +      consputc(c); + +      // store for consumption by consoleread(). +      cons.buf[cons.e++ % INPUT_BUF] = c; + +      if(c == '\n' || c == C('D') || cons.e == cons.r+INPUT_BUF){ +        // wake up consoleread() if a whole line (or end-of-file) +        // has arrived. +        cons.w = cons.e; +        wakeup(&cons.r); +      } +    } +    break; +  } +   +  release(&cons.lock); +} + +void +consoleinit(void) +{ +  initlock(&cons.lock, "cons"); + +  uartinit(); + +  // connect read and write system calls +  // to consoleread and consolewrite. +  devsw[CONSOLE].read = consoleread; +  devsw[CONSOLE].write = consolewrite; +} diff --git a/kernel/date.h b/kernel/date.h new file mode 100644 index 0000000..94aec4b --- /dev/null +++ b/kernel/date.h @@ -0,0 +1,8 @@ +struct rtcdate { +  uint second; +  uint minute; +  uint hour; +  uint day; +  uint month; +  uint year; +}; diff --git a/kernel/defs.h b/kernel/defs.h new file mode 100644 index 0000000..23dcd41 --- /dev/null +++ b/kernel/defs.h @@ -0,0 +1,186 @@ +struct buf; +struct context; +struct file; +struct inode; +struct pipe; +struct proc; +struct spinlock; +struct sleeplock; +struct stat; +struct superblock; + +// bio.c +void            binit(void); +struct buf*     bread(uint, uint); +void            brelse(struct buf*); +void            bwrite(struct buf*); +void            bpin(struct buf*); +void            bunpin(struct buf*); + +// console.c +void            consoleinit(void); +void            consoleintr(int); +void            consputc(int); + +// exec.c +int             exec(char*, char**); + +// file.c +struct file*    filealloc(void); +void            fileclose(struct file*); +struct file*    filedup(struct file*); +void            fileinit(void); +int             fileread(struct file*, uint64, int n); +int             filestat(struct file*, uint64 addr); +int             filewrite(struct file*, uint64, int n); + +// fs.c +void            fsinit(int); +int             dirlink(struct inode*, char*, uint); +struct inode*   dirlookup(struct inode*, char*, uint*); +struct inode*   ialloc(uint, short); +struct inode*   idup(struct inode*); +void            iinit(); +void            ilock(struct inode*); +void            iput(struct inode*); +void            iunlock(struct inode*); +void            iunlockput(struct inode*); +void            iupdate(struct inode*); +int             namecmp(const char*, const char*); +struct inode*   namei(char*); +struct inode*   nameiparent(char*, char*); +int             readi(struct inode*, int, uint64, uint, uint); +void            stati(struct inode*, struct stat*); +int             writei(struct inode*, int, uint64, uint, uint); + +// ramdisk.c 
+void            ramdiskinit(void); +void            ramdiskintr(void); +void            ramdiskrw(struct buf*); + +// kalloc.c +void*           kalloc(void); +void            kfree(void *); +void            kinit(); + +// log.c +void            initlog(int, struct superblock*); +void            log_write(struct buf*); +void            begin_op(); +void            end_op(); + +// pipe.c +int             pipealloc(struct file**, struct file**); +void            pipeclose(struct pipe*, int); +int             piperead(struct pipe*, uint64, int); +int             pipewrite(struct pipe*, uint64, int); + +// printf.c +void            printf(char*, ...); +void            panic(char*) __attribute__((noreturn)); +void            printfinit(void); + +// proc.c +int             cpuid(void); +void            exit(void); +int             fork(void); +int             growproc(int); +pagetable_t     proc_pagetable(struct proc *); +void            proc_freepagetable(pagetable_t, uint64); +int             kill(int); +struct cpu*     mycpu(void); +struct cpu*     getmycpu(void); +struct proc*    myproc(); +void            procinit(void); +void            scheduler(void) __attribute__((noreturn)); +void            sched(void); +void            setproc(struct proc*); +void            sleep(void*, struct spinlock*); +void            userinit(void); +int             wait(void); +void            wakeup(void*); +void            yield(void); +int             either_copyout(int user_dst, uint64 dst, void *src, uint64 len); +int             either_copyin(void *dst, int user_src, uint64 src, uint64 len); +void            procdump(void); + +// swtch.S +void            swtch(struct context*, struct context*); + +// spinlock.c +void            acquire(struct spinlock*); +int             holding(struct spinlock*); +void            initlock(struct spinlock*, char*); +void            release(struct spinlock*); +void            push_off(void); +void            pop_off(void); + +// sleeplock.c +void            acquiresleep(struct sleeplock*); +void            releasesleep(struct sleeplock*); +int             holdingsleep(struct sleeplock*); +void            initsleeplock(struct sleeplock*, char*); + +// string.c +int             memcmp(const void*, const void*, uint); +void*           memmove(void*, const void*, uint); +void*           memset(void*, int, uint); +char*           safestrcpy(char*, const char*, int); +int             strlen(const char*); +int             strncmp(const char*, const char*, uint); +char*           strncpy(char*, const char*, int); + +// syscall.c +int             argint(int, int*); +int             argstr(int, char*, int); +int             argaddr(int, uint64 *); +int             fetchstr(uint64, char*, int); +int             fetchaddr(uint64, uint64*); +void            syscall(); + +// trap.c +extern uint     ticks; +void            trapinit(void); +void            trapinithart(void); +extern struct spinlock tickslock; +void            usertrapret(void); + +// uart.c +void            uartinit(void); +void            uartintr(void); +void            uartputc(int); +int             uartgetc(void); + +// vm.c +void            kvminit(void); +void            kvminithart(void); +uint64          kvmpa(uint64); +void            kvmmap(uint64, uint64, uint64, int); +int             mappages(pagetable_t, uint64, uint64, uint64, int); +pagetable_t     uvmcreate(void); +void            uvminit(pagetable_t, uchar *, uint); +uint64          uvmalloc(pagetable_t, uint64, uint64); +uint64          
uvmdealloc(pagetable_t, uint64, uint64); +int             uvmcopy(pagetable_t, pagetable_t, uint64); +void            uvmfree(pagetable_t, uint64); +void            uvmunmap(pagetable_t, uint64, uint64, int); +void            uvmclear(pagetable_t, uint64); +uint64          walkaddr(pagetable_t, uint64); +int             copyout(pagetable_t, uint64, char *, uint64); +int             copyin(pagetable_t, char *, uint64, uint64); +int             copyinstr(pagetable_t, char *, uint64, uint64); + +// plic.c +void            plicinit(void); +void            plicinithart(void); +uint64          plic_pending(void); +int             plic_claim(void); +void            plic_complete(int); + +// virtio_disk.c +void            virtio_disk_init(void); +void            virtio_disk_rw(struct buf *, int); +void            virtio_disk_intr(); + +// number of elements in fixed-size array +#define NELEM(x) (sizeof(x)/sizeof((x)[0])) diff --git a/kernel/elf.h b/kernel/elf.h new file mode 100644 index 0000000..84555fa --- /dev/null +++ b/kernel/elf.h @@ -0,0 +1,42 @@ +// Format of an ELF executable file + +#define ELF_MAGIC 0x464C457FU  // "\x7FELF" in little endian + +// File header +struct elfhdr { +  uint magic;  // must equal ELF_MAGIC +  uchar elf[12]; +  ushort type; +  ushort machine; +  uint version; +  uint64 entry; +  uint64 phoff; +  uint64 shoff; +  uint flags; +  ushort ehsize; +  ushort phentsize; +  ushort phnum; +  ushort shentsize; +  ushort shnum; +  ushort shstrndx; +}; + +// Program section header +struct proghdr { +  uint32 type; +  uint32 flags; +  uint64 off; +  uint64 vaddr; +  uint64 paddr; +  uint64 filesz; +  uint64 memsz; +  uint64 align; +}; + +// Values for Proghdr type +#define ELF_PROG_LOAD           1 + +// Flag bits for Proghdr flags +#define ELF_PROG_FLAG_EXEC      1 +#define ELF_PROG_FLAG_WRITE     2 +#define ELF_PROG_FLAG_READ      4 diff --git a/kernel/entry.S b/kernel/entry.S new file mode 100644 index 0000000..ef5a56a --- /dev/null +++ b/kernel/entry.S @@ -0,0 +1,26 @@ +	# qemu -kernel starts at 0x1000. the instructions +        # there seem to be provided by qemu, as if it +        # were a ROM. the code at 0x1000 jumps to +        # 0x8000000, the _start function here, +        # in machine mode. each CPU starts here. +.section .data +.globl stack0 +.section .text +.globl start +.section .text +.globl _entry +_entry: +	# set up a stack for C. +        # stack0 is declared in start.c, +        # with a 4096-byte stack per CPU. 
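+        # (the stack grows downward, so sp must point at the top of
+        #  this hart's slice; that is why the code below adds 1 to
+        #  mhartid before multiplying.)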
+        # sp = stack0 + (hartid * 4096) +        la sp, stack0 +        li a0, 1024*4 +	csrr a1, mhartid +        addi a1, a1, 1 +        mul a0, a0, a1 +        add sp, sp, a0 +	# jump to start() in start.c +        call start +junk: +        j junk diff --git a/kernel/exec.c b/kernel/exec.c new file mode 100644 index 0000000..74ef654 --- /dev/null +++ b/kernel/exec.c @@ -0,0 +1,153 @@ +#include "types.h" +#include "param.h" +#include "memlayout.h" +#include "riscv.h" +#include "spinlock.h" +#include "proc.h" +#include "defs.h" +#include "elf.h" + +static int loadseg(pde_t *pgdir, uint64 addr, struct inode *ip, uint offset, uint sz); + +int +exec(char *path, char **argv) +{ +  char *s, *last; +  int i, off; +  uint64 argc, sz, sp, ustack[MAXARG+1], stackbase; +  struct elfhdr elf; +  struct inode *ip; +  struct proghdr ph; +  pagetable_t pagetable = 0, oldpagetable; +  struct proc *p = myproc(); + +  begin_op(); + +  if((ip = namei(path)) == 0){ +    end_op(); +    return -1; +  } +  ilock(ip); + +  // Check ELF header +  if(readi(ip, 0, (uint64)&elf, 0, sizeof(elf)) != sizeof(elf)) +    goto bad; +  if(elf.magic != ELF_MAGIC) +    goto bad; + +  if((pagetable = proc_pagetable(p)) == 0) +    goto bad; + +  // Load program into memory. +  sz = 0; +  for(i=0, off=elf.phoff; i<elf.phnum; i++, off+=sizeof(ph)){ +    if(readi(ip, 0, (uint64)&ph, off, sizeof(ph)) != sizeof(ph)) +      goto bad; +    if(ph.type != ELF_PROG_LOAD) +      continue; +    if(ph.memsz < ph.filesz) +      goto bad; +    if(ph.vaddr + ph.memsz < ph.vaddr) +      goto bad; +    if((sz = uvmalloc(pagetable, sz, ph.vaddr + ph.memsz)) == 0) +      goto bad; +    if(ph.vaddr % PGSIZE != 0) +      goto bad; +    if(loadseg(pagetable, ph.vaddr, ip, ph.off, ph.filesz) < 0) +      goto bad; +  } +  iunlockput(ip); +  end_op(); +  ip = 0; + +  p = myproc(); +  uint64 oldsz = p->sz; + +  // Allocate two pages at the next page boundary. +  // Use the second as the user stack. +  sz = PGROUNDUP(sz); +  if((sz = uvmalloc(pagetable, sz, sz + 2*PGSIZE)) == 0) +    goto bad; +  uvmclear(pagetable, sz-2*PGSIZE); +  sp = sz; +  stackbase = sp - PGSIZE; + +  // Push argument strings, prepare rest of stack in ustack. +  for(argc = 0; argv[argc]; argc++) { +    if(argc >= MAXARG) +      goto bad; +    sp -= strlen(argv[argc]) + 1; +    sp -= sp % 16; // riscv sp must be 16-byte aligned +    if(sp < stackbase) +      goto bad; +    if(copyout(pagetable, sp, argv[argc], strlen(argv[argc]) + 1) < 0) +      goto bad; +    ustack[argc] = sp; +  } +  ustack[argc] = 0; + +  // push the array of argv[] pointers. +  sp -= (argc+1) * sizeof(uint64); +  sp -= sp % 16; +  if(sp < stackbase) +    goto bad; +  if(copyout(pagetable, sp, (char *)ustack, (argc+1)*sizeof(uint64)) < 0) +    goto bad; + +  // arguments to user main(argc, argv) +  // argc is returned via the system call return +  // value, which goes in a0. +  p->tf->a1 = sp; + +  // Save program name for debugging. +  for(last=s=path; *s; s++) +    if(*s == '/') +      last = s+1; +  safestrcpy(p->name, last, sizeof(p->name)); +     +  // Commit to the user image. 
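+  // Any failure above jumps to bad, which frees only the new page table
+  // and leaves the old image intact; from this point on the process is
+  // irrevocably switched to the new image and the old one is freed.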
+  oldpagetable = p->pagetable; +  p->pagetable = pagetable; +  p->sz = sz; +  p->tf->epc = elf.entry;  // initial program counter = main +  p->tf->sp = sp; // initial stack pointer +  proc_freepagetable(oldpagetable, oldsz); +  return argc; // this ends up in a0, the first argument to main(argc, argv) + + bad: +  if(pagetable) +    proc_freepagetable(pagetable, sz); +  if(ip){ +    iunlockput(ip); +    end_op(); +  } +  return -1; +} + +// Load a program segment into pagetable at virtual address va. +// va must be page-aligned +// and the pages from va to va+sz must already be mapped. +// Returns 0 on success, -1 on failure. +static int +loadseg(pagetable_t pagetable, uint64 va, struct inode *ip, uint offset, uint sz) +{ +  uint i, n; +  uint64 pa; + +  if((va % PGSIZE) != 0) +    panic("loadseg: va must be page aligned"); + +  for(i = 0; i < sz; i += PGSIZE){ +    pa = walkaddr(pagetable, va + i); +    if(pa == 0) +      panic("loadseg: address should exist"); +    if(sz - i < PGSIZE) +      n = sz - i; +    else +      n = PGSIZE; +    if(readi(ip, 0, (uint64)pa, offset+i, n) != n) +      return -1; +  } +   +  return 0; +} diff --git a/kernel/fcntl.h b/kernel/fcntl.h new file mode 100644 index 0000000..d565483 --- /dev/null +++ b/kernel/fcntl.h @@ -0,0 +1,4 @@ +#define O_RDONLY  0x000 +#define O_WRONLY  0x001 +#define O_RDWR    0x002 +#define O_CREATE  0x200 diff --git a/kernel/file.c b/kernel/file.c new file mode 100644 index 0000000..7663476 --- /dev/null +++ b/kernel/file.c @@ -0,0 +1,178 @@ +// +// Support functions for system calls that involve file descriptors. +// + +#include "types.h" +#include "riscv.h" +#include "defs.h" +#include "param.h" +#include "fs.h" +#include "spinlock.h" +#include "sleeplock.h" +#include "file.h" +#include "stat.h" +#include "proc.h" + +struct devsw devsw[NDEV]; +struct { +  struct spinlock lock; +  struct file file[NFILE]; +} ftable; + +void +fileinit(void) +{ +  initlock(&ftable.lock, "ftable"); +} + +// Allocate a file structure. +struct file* +filealloc(void) +{ +  struct file *f; + +  acquire(&ftable.lock); +  for(f = ftable.file; f < ftable.file + NFILE; f++){ +    if(f->ref == 0){ +      f->ref = 1; +      release(&ftable.lock); +      return f; +    } +  } +  release(&ftable.lock); +  return 0; +} + +// Increment ref count for file f. +struct file* +filedup(struct file *f) +{ +  acquire(&ftable.lock); +  if(f->ref < 1) +    panic("filedup"); +  f->ref++; +  release(&ftable.lock); +  return f; +} + +// Close file f.  (Decrement ref count, close when reaches 0.) +void +fileclose(struct file *f) +{ +  struct file ff; + +  acquire(&ftable.lock); +  if(f->ref < 1) +    panic("fileclose"); +  if(--f->ref > 0){ +    release(&ftable.lock); +    return; +  } +  ff = *f; +  f->ref = 0; +  f->type = FD_NONE; +  release(&ftable.lock); + +  if(ff.type == FD_PIPE){ +    pipeclose(ff.pipe, ff.writable); +  } else if(ff.type == FD_INODE || ff.type == FD_DEVICE){ +    begin_op(); +    iput(ff.ip); +    end_op(); +  } +} + +// Get metadata about file f. +// addr is a user virtual address, pointing to a struct stat. +int +filestat(struct file *f, uint64 addr) +{ +  struct proc *p = myproc(); +  struct stat st; +   +  if(f->type == FD_INODE || f->type == FD_DEVICE){ +    ilock(f->ip); +    stati(f->ip, &st); +    iunlock(f->ip); +    if(copyout(p->pagetable, addr, (char *)&st, sizeof(st)) < 0) +      return -1; +    return 0; +  } +  return -1; +} + +// Read from file f. +// addr is a user virtual address. 
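+// Returns the number of bytes actually read, or -1 on error.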
+int +fileread(struct file *f, uint64 addr, int n) +{ +  int r = 0; + +  if(f->readable == 0) +    return -1; + +  if(f->type == FD_PIPE){ +    r = piperead(f->pipe, addr, n); +  } else if(f->type == FD_DEVICE){ +    r = devsw[f->major].read(1, addr, n); +  } else if(f->type == FD_INODE){ +    ilock(f->ip); +    if((r = readi(f->ip, 1, addr, f->off, n)) > 0) +      f->off += r; +    iunlock(f->ip); +  } else { +    panic("fileread"); +  } + +  return r; +} + +// Write to file f. +// addr is a user virtual address. +int +filewrite(struct file *f, uint64 addr, int n) +{ +  int r, ret = 0; + +  if(f->writable == 0) +    return -1; + +  if(f->type == FD_PIPE){ +    ret = pipewrite(f->pipe, addr, n); +  } else if(f->type == FD_DEVICE){ +    ret = devsw[f->major].write(1, addr, n); +  } else if(f->type == FD_INODE){ +    // write a few blocks at a time to avoid exceeding +    // the maximum log transaction size, including +    // i-node, indirect block, allocation blocks, +    // and 2 blocks of slop for non-aligned writes. +    // this really belongs lower down, since writei() +    // might be writing a device like the console. +    int max = ((MAXOPBLOCKS-1-1-2) / 2) * 512; +    int i = 0; +    while(i < n){ +      int n1 = n - i; +      if(n1 > max) +        n1 = max; + +      begin_op(); +      ilock(f->ip); +      if ((r = writei(f->ip, 1, addr + i, f->off, n1)) > 0) +        f->off += r; +      iunlock(f->ip); +      end_op(); + +      if(r < 0) +        break; +      if(r != n1) +        panic("short filewrite"); +      i += r; +    } +    ret = (i == n ? n : -1); +  } else { +    panic("filewrite"); +  } + +  return ret; +} + diff --git a/kernel/file.h b/kernel/file.h new file mode 100644 index 0000000..5cf15a2 --- /dev/null +++ b/kernel/file.h @@ -0,0 +1,37 @@ +struct file { +  enum { FD_NONE, FD_PIPE, FD_INODE, FD_DEVICE } type; +  int ref; // reference count +  char readable; +  char writable; +  struct pipe *pipe; // FD_PIPE +  struct inode *ip;  // FD_INODE and FD_DEVICE +  uint off;          // FD_INODE +  short major;       // FD_DEVICE +}; + + +// in-memory copy of an inode +struct inode { +  uint dev;           // Device number +  uint inum;          // Inode number +  int ref;            // Reference count +  struct sleeplock lock; // protects everything below here +  int valid;          // inode has been read from disk? + +  short type;         // copy of disk inode +  short major; +  short minor; +  short nlink; +  uint size; +  uint addrs[NDIRECT+1]; +}; + +// map major device number to device functions. +struct devsw { +  int (*read)(int, uint64, int); +  int (*write)(int, uint64, int); +}; + +extern struct devsw devsw[]; + +#define CONSOLE 1 diff --git a/kernel/fs.c b/kernel/fs.c new file mode 100644 index 0000000..3a74c1f --- /dev/null +++ b/kernel/fs.c @@ -0,0 +1,667 @@ +// File system implementation.  Five layers: +//   + Blocks: allocator for raw disk blocks. +//   + Log: crash recovery for multi-step updates. +//   + Files: inode allocator, reading, writing, metadata. +//   + Directories: inode with special contents (list of other inodes!) +//   + Names: paths like /usr/rtm/xv6/fs.c for convenient naming. +// +// This file contains the low-level file system manipulation +// routines.  The (higher-level) system call implementations +// are in sysfile.c. 
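+//
+// A typical update walks down through these layers; filewrite() in
+// file.c, for example, wraps each chunk of a write in a log
+// transaction:
+//
+//   begin_op();
+//   ilock(f->ip);
+//   writei(f->ip, 1, addr, f->off, n1);  // bmap()/balloc() + log_write()
+//   iunlock(f->ip);
+//   end_op();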
+ +#include "types.h" +#include "riscv.h" +#include "defs.h" +#include "param.h" +#include "stat.h" +#include "spinlock.h" +#include "proc.h" +#include "sleeplock.h" +#include "fs.h" +#include "buf.h" +#include "file.h" + +#define min(a, b) ((a) < (b) ? (a) : (b)) +static void itrunc(struct inode*); +// there should be one superblock per disk device, but we run with +// only one device +struct superblock sb;  + +// Read the super block. +static void +readsb(int dev, struct superblock *sb) +{ +  struct buf *bp; + +  bp = bread(dev, 1); +  memmove(sb, bp->data, sizeof(*sb)); +  brelse(bp); +} + +// Init fs +void +fsinit(int dev) { +  readsb(dev, &sb); +  if(sb.magic != FSMAGIC) +    panic("invalid file system"); +  initlog(dev, &sb); +} + +// Zero a block. +static void +bzero(int dev, int bno) +{ +  struct buf *bp; + +  bp = bread(dev, bno); +  memset(bp->data, 0, BSIZE); +  log_write(bp); +  brelse(bp); +} + +// Blocks. + +// Allocate a zeroed disk block. +static uint +balloc(uint dev) +{ +  int b, bi, m; +  struct buf *bp; + +  bp = 0; +  for(b = 0; b < sb.size; b += BPB){ +    bp = bread(dev, BBLOCK(b, sb)); +    for(bi = 0; bi < BPB && b + bi < sb.size; bi++){ +      m = 1 << (bi % 8); +      if((bp->data[bi/8] & m) == 0){  // Is block free? +        bp->data[bi/8] |= m;  // Mark block in use. +        log_write(bp); +        brelse(bp); +        bzero(dev, b + bi); +        return b + bi; +      } +    } +    brelse(bp); +  } +  panic("balloc: out of blocks"); +} + +// Free a disk block. +static void +bfree(int dev, uint b) +{ +  struct buf *bp; +  int bi, m; + +  readsb(dev, &sb); +  bp = bread(dev, BBLOCK(b, sb)); +  bi = b % BPB; +  m = 1 << (bi % 8); +  if((bp->data[bi/8] & m) == 0) +    panic("freeing free block"); +  bp->data[bi/8] &= ~m; +  log_write(bp); +  brelse(bp); +} + +// Inodes. +// +// An inode describes a single unnamed file. +// The inode disk structure holds metadata: the file's type, +// its size, the number of links referring to it, and the +// list of blocks holding the file's content. +// +// The inodes are laid out sequentially on disk at +// sb.startinode. Each inode has a number, indicating its +// position on the disk. +// +// The kernel keeps a cache of in-use inodes in memory +// to provide a place for synchronizing access +// to inodes used by multiple processes. The cached +// inodes include book-keeping information that is +// not stored on disk: ip->ref and ip->valid. +// +// An inode and its in-memory representation go through a +// sequence of states before they can be used by the +// rest of the file system code. +// +// * Allocation: an inode is allocated if its type (on disk) +//   is non-zero. ialloc() allocates, and iput() frees if +//   the reference and link counts have fallen to zero. +// +// * Referencing in cache: an entry in the inode cache +//   is free if ip->ref is zero. Otherwise ip->ref tracks +//   the number of in-memory pointers to the entry (open +//   files and current directories). iget() finds or +//   creates a cache entry and increments its ref; iput() +//   decrements ref. +// +// * Valid: the information (type, size, &c) in an inode +//   cache entry is only correct when ip->valid is 1. +//   ilock() reads the inode from +//   the disk and sets ip->valid, while iput() clears +//   ip->valid if ip->ref has fallen to zero. +// +// * Locked: file system code may only examine and modify +//   the information in an inode and its content if it +//   has first locked the inode. 
+// +// Thus a typical sequence is: +//   ip = iget(dev, inum) +//   ilock(ip) +//   ... examine and modify ip->xxx ... +//   iunlock(ip) +//   iput(ip) +// +// ilock() is separate from iget() so that system calls can +// get a long-term reference to an inode (as for an open file) +// and only lock it for short periods (e.g., in read()). +// The separation also helps avoid deadlock and races during +// pathname lookup. iget() increments ip->ref so that the inode +// stays cached and pointers to it remain valid. +// +// Many internal file system functions expect the caller to +// have locked the inodes involved; this lets callers create +// multi-step atomic operations. +// +// The icache.lock spin-lock protects the allocation of icache +// entries. Since ip->ref indicates whether an entry is free, +// and ip->dev and ip->inum indicate which i-node an entry +// holds, one must hold icache.lock while using any of those fields. +// +// An ip->lock sleep-lock protects all ip-> fields other than ref, +// dev, and inum.  One must hold ip->lock in order to +// read or write that inode's ip->valid, ip->size, ip->type, &c. + +struct { +  struct spinlock lock; +  struct inode inode[NINODE]; +} icache; + +void +iinit() +{ +  int i = 0; +   +  initlock(&icache.lock, "icache"); +  for(i = 0; i < NINODE; i++) { +    initsleeplock(&icache.inode[i].lock, "inode"); +  } +} + +static struct inode* iget(uint dev, uint inum); + +// Allocate an inode on device dev. +// Mark it as allocated by  giving it type type. +// Returns an unlocked but allocated and referenced inode. +struct inode* +ialloc(uint dev, short type) +{ +  int inum; +  struct buf *bp; +  struct dinode *dip; + +  for(inum = 1; inum < sb.ninodes; inum++){ +    bp = bread(dev, IBLOCK(inum, sb)); +    dip = (struct dinode*)bp->data + inum%IPB; +    if(dip->type == 0){  // a free inode +      memset(dip, 0, sizeof(*dip)); +      dip->type = type; +      log_write(bp);   // mark it allocated on the disk +      brelse(bp); +      return iget(dev, inum); +    } +    brelse(bp); +  } +  panic("ialloc: no inodes"); +} + +// Copy a modified in-memory inode to disk. +// Must be called after every change to an ip->xxx field +// that lives on disk, since i-node cache is write-through. +// Caller must hold ip->lock. +void +iupdate(struct inode *ip) +{ +  struct buf *bp; +  struct dinode *dip; + +  bp = bread(ip->dev, IBLOCK(ip->inum, sb)); +  dip = (struct dinode*)bp->data + ip->inum%IPB; +  dip->type = ip->type; +  dip->major = ip->major; +  dip->minor = ip->minor; +  dip->nlink = ip->nlink; +  dip->size = ip->size; +  memmove(dip->addrs, ip->addrs, sizeof(ip->addrs)); +  log_write(bp); +  brelse(bp); +} + +// Find the inode with number inum on device dev +// and return the in-memory copy. Does not lock +// the inode and does not read it from disk. +static struct inode* +iget(uint dev, uint inum) +{ +  struct inode *ip, *empty; + +  acquire(&icache.lock); + +  // Is the inode already cached? +  empty = 0; +  for(ip = &icache.inode[0]; ip < &icache.inode[NINODE]; ip++){ +    if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){ +      ip->ref++; +      release(&icache.lock); +      return ip; +    } +    if(empty == 0 && ip->ref == 0)    // Remember empty slot. +      empty = ip; +  } + +  // Recycle an inode cache entry. +  if(empty == 0) +    panic("iget: no inodes"); + +  ip = empty; +  ip->dev = dev; +  ip->inum = inum; +  ip->ref = 1; +  ip->valid = 0; +  release(&icache.lock); + +  return ip; +} + +// Increment reference count for ip. 
+// Returns ip to enable ip = idup(ip1) idiom. +struct inode* +idup(struct inode *ip) +{ +  acquire(&icache.lock); +  ip->ref++; +  release(&icache.lock); +  return ip; +} + +// Lock the given inode. +// Reads the inode from disk if necessary. +void +ilock(struct inode *ip) +{ +  struct buf *bp; +  struct dinode *dip; + +  if(ip == 0 || ip->ref < 1) +    panic("ilock"); + +  acquiresleep(&ip->lock); + +  if(ip->valid == 0){ +    bp = bread(ip->dev, IBLOCK(ip->inum, sb)); +    dip = (struct dinode*)bp->data + ip->inum%IPB; +    ip->type = dip->type; +    ip->major = dip->major; +    ip->minor = dip->minor; +    ip->nlink = dip->nlink; +    ip->size = dip->size; +    memmove(ip->addrs, dip->addrs, sizeof(ip->addrs)); +    brelse(bp); +    ip->valid = 1; +    if(ip->type == 0) +      panic("ilock: no type"); +  } +} + +// Unlock the given inode. +void +iunlock(struct inode *ip) +{ +  if(ip == 0 || !holdingsleep(&ip->lock) || ip->ref < 1) +    panic("iunlock"); + +  releasesleep(&ip->lock); +} + +// Drop a reference to an in-memory inode. +// If that was the last reference, the inode cache entry can +// be recycled. +// If that was the last reference and the inode has no links +// to it, free the inode (and its content) on disk. +// All calls to iput() must be inside a transaction in +// case it has to free the inode. +void +iput(struct inode *ip) +{ +  acquire(&icache.lock); + +  if(ip->ref == 1 && ip->valid && ip->nlink == 0){ +    // inode has no links and no other references: truncate and free. + +    // ip->ref == 1 means no other process can have ip locked, +    // so this acquiresleep() won't block (or deadlock). +    acquiresleep(&ip->lock); + +    release(&icache.lock); + +    itrunc(ip); +    ip->type = 0; +    iupdate(ip); +    ip->valid = 0; + +    releasesleep(&ip->lock); + +    acquire(&icache.lock); +  } + +  ip->ref--; +  release(&icache.lock); +} + +// Common idiom: unlock, then put. +void +iunlockput(struct inode *ip) +{ +  iunlock(ip); +  iput(ip); +} + +// Inode content +// +// The content (data) associated with each inode is stored +// in blocks on the disk. The first NDIRECT block numbers +// are listed in ip->addrs[].  The next NINDIRECT blocks are +// listed in block ip->addrs[NDIRECT]. + +// Return the disk block address of the nth block in inode ip. +// If there is no such block, bmap allocates one. +static uint +bmap(struct inode *ip, uint bn) +{ +  uint addr, *a; +  struct buf *bp; + +  if(bn < NDIRECT){ +    if((addr = ip->addrs[bn]) == 0) +      ip->addrs[bn] = addr = balloc(ip->dev); +    return addr; +  } +  bn -= NDIRECT; + +  if(bn < NINDIRECT){ +    // Load indirect block, allocating if necessary. +    if((addr = ip->addrs[NDIRECT]) == 0) +      ip->addrs[NDIRECT] = addr = balloc(ip->dev); +    bp = bread(ip->dev, addr); +    a = (uint*)bp->data; +    if((addr = a[bn]) == 0){ +      a[bn] = addr = balloc(ip->dev); +      log_write(bp); +    } +    brelse(bp); +    return addr; +  } + +  panic("bmap: out of range"); +} + +// Truncate inode (discard contents). +// Only called when the inode has no links +// to it (no directory entries referring to it) +// and has no in-memory reference to it (is +// not an open file or current directory). 
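+// Caller must hold ip->lock.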
+static void +itrunc(struct inode *ip) +{ +  int i, j; +  struct buf *bp; +  uint *a; + +  for(i = 0; i < NDIRECT; i++){ +    if(ip->addrs[i]){ +      bfree(ip->dev, ip->addrs[i]); +      ip->addrs[i] = 0; +    } +  } + +  if(ip->addrs[NDIRECT]){ +    bp = bread(ip->dev, ip->addrs[NDIRECT]); +    a = (uint*)bp->data; +    for(j = 0; j < NINDIRECT; j++){ +      if(a[j]) +        bfree(ip->dev, a[j]); +    } +    brelse(bp); +    bfree(ip->dev, ip->addrs[NDIRECT]); +    ip->addrs[NDIRECT] = 0; +  } + +  ip->size = 0; +  iupdate(ip); +} + +// Copy stat information from inode. +// Caller must hold ip->lock. +void +stati(struct inode *ip, struct stat *st) +{ +  st->dev = ip->dev; +  st->ino = ip->inum; +  st->type = ip->type; +  st->nlink = ip->nlink; +  st->size = ip->size; +} + +// Read data from inode. +// Caller must hold ip->lock. +// If user_dst==1, then dst is a user virtual address; +// otherwise, dst is a kernel address. +int +readi(struct inode *ip, int user_dst, uint64 dst, uint off, uint n) +{ +  uint tot, m; +  struct buf *bp; + +  if(off > ip->size || off + n < off) +    return -1; +  if(off + n > ip->size) +    n = ip->size - off; + +  for(tot=0; tot<n; tot+=m, off+=m, dst+=m){ +    bp = bread(ip->dev, bmap(ip, off/BSIZE)); +    m = min(n - tot, BSIZE - off%BSIZE); +    if(either_copyout(user_dst, dst, bp->data + (off % BSIZE), m) == -1) +      break; +    brelse(bp); +  } +  return n; +} + +// Write data to inode. +// Caller must hold ip->lock. +// If user_src==1, then src is a user virtual address; +// otherwise, src is a kernel address. +int +writei(struct inode *ip, int user_src, uint64 src, uint off, uint n) +{ +  uint tot, m; +  struct buf *bp; + +  if(off > ip->size || off + n < off) +    return -1; +  if(off + n > MAXFILE*BSIZE) +    return -1; + +  for(tot=0; tot<n; tot+=m, off+=m, src+=m){ +    bp = bread(ip->dev, bmap(ip, off/BSIZE)); +    m = min(n - tot, BSIZE - off%BSIZE); +    if(either_copyin(bp->data + (off % BSIZE), user_src, src, m) == -1) +      break; +    log_write(bp); +    brelse(bp); +  } + +  if(n > 0 && off > ip->size){ +    ip->size = off; +    iupdate(ip); +  } +  return n; +} + +// Directories + +int +namecmp(const char *s, const char *t) +{ +  return strncmp(s, t, DIRSIZ); +} + +// Look for a directory entry in a directory. +// If found, set *poff to byte offset of entry. +struct inode* +dirlookup(struct inode *dp, char *name, uint *poff) +{ +  uint off, inum; +  struct dirent de; + +  if(dp->type != T_DIR) +    panic("dirlookup not DIR"); + +  for(off = 0; off < dp->size; off += sizeof(de)){ +    if(readi(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de)) +      panic("dirlookup read"); +    if(de.inum == 0) +      continue; +    if(namecmp(name, de.name) == 0){ +      // entry matches path element +      if(poff) +        *poff = off; +      inum = de.inum; +      return iget(dp->dev, inum); +    } +  } + +  return 0; +} + +// Write a new directory entry (name, inum) into the directory dp. +int +dirlink(struct inode *dp, char *name, uint inum) +{ +  int off; +  struct dirent de; +  struct inode *ip; + +  // Check that name is not present. +  if((ip = dirlookup(dp, name, 0)) != 0){ +    iput(ip); +    return -1; +  } + +  // Look for an empty dirent. 
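+  // (If no entry is free, off ends up equal to dp->size and the
+  // writei() below appends a new entry, growing the directory.)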
+  for(off = 0; off < dp->size; off += sizeof(de)){ +    if(readi(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de)) +      panic("dirlink read"); +    if(de.inum == 0) +      break; +  } + +  strncpy(de.name, name, DIRSIZ); +  de.inum = inum; +  if(writei(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de)) +    panic("dirlink"); + +  return 0; +} + +// Paths + +// Copy the next path element from path into name. +// Return a pointer to the element following the copied one. +// The returned path has no leading slashes, +// so the caller can check *path=='\0' to see if the name is the last one. +// If no name to remove, return 0. +// +// Examples: +//   skipelem("a/bb/c", name) = "bb/c", setting name = "a" +//   skipelem("///a//bb", name) = "bb", setting name = "a" +//   skipelem("a", name) = "", setting name = "a" +//   skipelem("", name) = skipelem("////", name) = 0 +// +static char* +skipelem(char *path, char *name) +{ +  char *s; +  int len; + +  while(*path == '/') +    path++; +  if(*path == 0) +    return 0; +  s = path; +  while(*path != '/' && *path != 0) +    path++; +  len = path - s; +  if(len >= DIRSIZ) +    memmove(name, s, DIRSIZ); +  else { +    memmove(name, s, len); +    name[len] = 0; +  } +  while(*path == '/') +    path++; +  return path; +} + +// Look up and return the inode for a path name. +// If parent != 0, return the inode for the parent and copy the final +// path element into name, which must have room for DIRSIZ bytes. +// Must be called inside a transaction since it calls iput(). +static struct inode* +namex(char *path, int nameiparent, char *name) +{ +  struct inode *ip, *next; + +  if(*path == '/') +    ip = iget(ROOTDEV, ROOTINO); +  else +    ip = idup(myproc()->cwd); + +  while((path = skipelem(path, name)) != 0){ +    ilock(ip); +    if(ip->type != T_DIR){ +      iunlockput(ip); +      return 0; +    } +    if(nameiparent && *path == '\0'){ +      // Stop one level early. +      iunlock(ip); +      return ip; +    } +    if((next = dirlookup(ip, name, 0)) == 0){ +      iunlockput(ip); +      return 0; +    } +    iunlockput(ip); +    ip = next; +  } +  if(nameiparent){ +    iput(ip); +    return 0; +  } +  return ip; +} + +struct inode* +namei(char *path) +{ +  char name[DIRSIZ]; +  return namex(path, 0, name); +} + +struct inode* +nameiparent(char *path, char *name) +{ +  return namex(path, 1, name); +} diff --git a/kernel/fs.h b/kernel/fs.h new file mode 100644 index 0000000..139dcc9 --- /dev/null +++ b/kernel/fs.h @@ -0,0 +1,60 @@ +// On-disk file system format. +// Both the kernel and user programs use this header file. + + +#define ROOTINO  1   // root i-number +#define BSIZE 1024  // block size + +// Disk layout: +// [ boot block | super block | log | inode blocks | +//                                          free bit map | data blocks] +// +// mkfs computes the super block and builds an initial file system. The +// super block describes the disk layout: +struct superblock { +  uint magic;        // Must be FSMAGIC +  uint size;         // Size of file system image (blocks) +  uint nblocks;      // Number of data blocks +  uint ninodes;      // Number of inodes. 
+  uint nlog;         // Number of log blocks +  uint logstart;     // Block number of first log block +  uint inodestart;   // Block number of first inode block +  uint bmapstart;    // Block number of first free map block +}; + +#define FSMAGIC 0x10203040 + +#define NDIRECT 12 +#define NINDIRECT (BSIZE / sizeof(uint)) +#define MAXFILE (NDIRECT + NINDIRECT) + +// On-disk inode structure +struct dinode { +  short type;           // File type +  short major;          // Major device number (T_DEVICE only) +  short minor;          // Minor device number (T_DEVICE only) +  short nlink;          // Number of links to inode in file system +  uint size;            // Size of file (bytes) +  uint addrs[NDIRECT+1];   // Data block addresses +}; + +// Inodes per block. +#define IPB           (BSIZE / sizeof(struct dinode)) + +// Block containing inode i +#define IBLOCK(i, sb)     ((i) / IPB + sb.inodestart) + +// Bitmap bits per block +#define BPB           (BSIZE*8) + +// Block of free map containing bit for block b +#define BBLOCK(b, sb) ((b)/BPB + sb.bmapstart) + +// Directory is a file containing a sequence of dirent structures. +#define DIRSIZ 14 + +struct dirent { +  ushort inum; +  char name[DIRSIZ]; +}; + diff --git a/kernel/kalloc.c b/kernel/kalloc.c new file mode 100644 index 0000000..ae3863b --- /dev/null +++ b/kernel/kalloc.c @@ -0,0 +1,83 @@ +// Physical memory allocator, for user processes, +// kernel stacks, page-table pages, +// and pipe buffers. Allocates whole 4096-byte pages. + +#include "types.h" +#include "param.h" +#include "memlayout.h" +#include "spinlock.h" +#include "riscv.h" +#include "defs.h" + +void freerange(void *pa_start, void *pa_end); + +extern char end[]; // first address after kernel. +                   // defined by kernel.ld. + +struct run { +  struct run *next; +}; + +struct { +  struct spinlock lock; +  struct run *freelist; +} kmem; + +void +kinit() +{ +  initlock(&kmem.lock, "kmem"); +  freerange(end, (void*)PHYSTOP); +} + +void +freerange(void *pa_start, void *pa_end) +{ +  char *p; +  p = (char*)PGROUNDUP((uint64)pa_start); +  p += 4096; // XXX I can't get kernel.ld to place end beyond the last bss symbol. +  for(; p + PGSIZE <= (char*)pa_end; p += PGSIZE) +    kfree(p); +} + +// Free the page of physical memory pointed at by v, +// which normally should have been returned by a +// call to kalloc().  (The exception is when +// initializing the allocator; see kinit above.) +void +kfree(void *pa) +{ +  struct run *r; + +  if(((uint64)pa % PGSIZE) != 0 || (char*)pa < end || (uint64)pa >= PHYSTOP) +    panic("kfree"); + +  // Fill with junk to catch dangling refs. +  memset(pa, 1, PGSIZE); + +  r = (struct run*)pa; + +  acquire(&kmem.lock); +  r->next = kmem.freelist; +  kmem.freelist = r; +  release(&kmem.lock); +} + +// Allocate one 4096-byte page of physical memory. +// Returns a pointer that the kernel can use. +// Returns 0 if the memory cannot be allocated. +void * +kalloc(void) +{ +  struct run *r; + +  acquire(&kmem.lock); +  r = kmem.freelist; +  if(r) +    kmem.freelist = r->next; +  release(&kmem.lock); + +  if(r) +    memset((char*)r, 5, PGSIZE); // fill with junk +  return (void*)r; +} diff --git a/kernel/kernel.ld b/kernel/kernel.ld new file mode 100644 index 0000000..0b5e76b --- /dev/null +++ b/kernel/kernel.ld @@ -0,0 +1,32 @@ +OUTPUT_ARCH( "riscv" ) +ENTRY( _entry ) + +SECTIONS +{ +  /* +   * ensure that entry.S / _entry is at 0x80000000, +   * where qemu's -kernel jumps. +   */ +  . = 0x80000000; +  .text : +  { +    *(.text) +    . 
= ALIGN(0x1000); +    *(trampsec) +  } + +  . = ALIGN(0x1000); +  PROVIDE(etext = .); + +  /* +   * make sure end is after data and bss. +   */ +  .data : { +    *(.data) +  } +  bss : { +    *(.bss) +    PROVIDE(end = .); +  } + +} diff --git a/kernel/kernelvec.S b/kernel/kernelvec.S new file mode 100644 index 0000000..3e9d3e9 --- /dev/null +++ b/kernel/kernelvec.S @@ -0,0 +1,121 @@ +	# +        # interrupts and exceptions while in supervisor +        # mode come here. +        # +        # push all registers, call kerneltrap(), restore, return. +        # +.globl kerneltrap +.globl kernelvec +.align 4 +kernelvec: +        // make room to save registers. +        addi sp, sp, -256 + +        // save the registers. +        sd ra, 0(sp) +        sd sp, 8(sp) +        sd gp, 16(sp) +        sd tp, 24(sp) +        sd t0, 32(sp) +        sd t1, 40(sp) +        sd t2, 48(sp) +        sd s0, 56(sp) +        sd s1, 64(sp) +        sd a0, 72(sp) +        sd a1, 80(sp) +        sd a2, 88(sp) +        sd a3, 96(sp) +        sd a4, 104(sp) +        sd a5, 112(sp) +        sd a6, 120(sp) +        sd a7, 128(sp) +        sd s2, 136(sp) +        sd s3, 144(sp) +        sd s4, 152(sp) +        sd s5, 160(sp) +        sd s6, 168(sp) +        sd s7, 176(sp) +        sd s8, 184(sp) +        sd s9, 192(sp) +        sd s10, 200(sp) +        sd s11, 208(sp) +        sd t3, 216(sp) +        sd t4, 224(sp) +        sd t5, 232(sp) +        sd t6, 240(sp) + +	// call the C trap handler in trap.c +        call kerneltrap + +        // restore registers. +        ld ra, 0(sp) +        ld sp, 8(sp) +        ld gp, 16(sp) +        // not this, in case we moved CPUs: ld tp, 24(sp) +        ld t0, 32(sp) +        ld t1, 40(sp) +        ld t2, 48(sp) +        ld s0, 56(sp) +        ld s1, 64(sp) +        ld a0, 72(sp) +        ld a1, 80(sp) +        ld a2, 88(sp) +        ld a3, 96(sp) +        ld a4, 104(sp) +        ld a5, 112(sp) +        ld a6, 120(sp) +        ld a7, 128(sp) +        ld s2, 136(sp) +        ld s3, 144(sp) +        ld s4, 152(sp) +        ld s5, 160(sp) +        ld s6, 168(sp) +        ld s7, 176(sp) +        ld s8, 184(sp) +        ld s9, 192(sp) +        ld s10, 200(sp) +        ld s11, 208(sp) +        ld t3, 216(sp) +        ld t4, 224(sp) +        ld t5, 232(sp) +        ld t6, 240(sp) + +        addi sp, sp, 256 + +        // return to whatever we were doing in the kernel. +        sret + +        # +        # machine-mode timer interrupt. +        # +.globl timervec +.align 4 +timervec: +        # start.c has set up the memory that mscratch points to: +        # scratch[0,8,16] : register save area. +        # scratch[32] : address of CLINT's MTIMECMP register. +        # scratch[40] : desired interval between interrupts. +         +        csrrw a0, mscratch, a0 +        sd a1, 0(a0) +        sd a2, 8(a0) +        sd a3, 16(a0) + +        # schedule the next timer interrupt +        # by adding interval to mtimecmp. +        ld a1, 32(a0) # CLINT_MTIMECMP(hart) +        ld a2, 40(a0) # interval +        ld a3, 0(a1) +        add a3, a3, a2 +        sd a3, 0(a1) + +        # raise a supervisor software interrupt. 
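+        # (bit 1 of sip is SSIP; setting it leaves a supervisor software
+        #  interrupt pending, which the kernel takes once it returns to
+        #  supervisor mode and treats as the timer tick.)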
+	li a1, 2 +        csrw sip, a1 + +        ld a3, 16(a0) +        ld a2, 8(a0) +        ld a1, 0(a0) +        csrrw a0, mscratch, a0 + +        mret diff --git a/kernel/log.c b/kernel/log.c new file mode 100644 index 0000000..5e884bb --- /dev/null +++ b/kernel/log.c @@ -0,0 +1,235 @@ +#include "types.h" +#include "riscv.h" +#include "defs.h" +#include "param.h" +#include "spinlock.h" +#include "sleeplock.h" +#include "fs.h" +#include "buf.h" + +// Simple logging that allows concurrent FS system calls. +// +// A log transaction contains the updates of multiple FS system +// calls. The logging system only commits when there are +// no FS system calls active. Thus there is never +// any reasoning required about whether a commit might +// write an uncommitted system call's updates to disk. +// +// A system call should call begin_op()/end_op() to mark +// its start and end. Usually begin_op() just increments +// the count of in-progress FS system calls and returns. +// But if it thinks the log is close to running out, it +// sleeps until the last outstanding end_op() commits. +// +// The log is a physical re-do log containing disk blocks. +// The on-disk log format: +//   header block, containing block #s for block A, B, C, ... +//   block A +//   block B +//   block C +//   ... +// Log appends are synchronous. + +// Contents of the header block, used for both the on-disk header block +// and to keep track in memory of logged block# before commit. +struct logheader { +  int n; +  int block[LOGSIZE]; +}; + +struct log { +  struct spinlock lock; +  int start; +  int size; +  int outstanding; // how many FS sys calls are executing. +  int committing;  // in commit(), please wait. +  int dev; +  struct logheader lh; +}; +struct log log; + +static void recover_from_log(void); +static void commit(); + +void +initlog(int dev, struct superblock *sb) +{ +  if (sizeof(struct logheader) >= BSIZE) +    panic("initlog: too big logheader"); + +  initlock(&log.lock, "log"); +  log.start = sb->logstart; +  log.size = sb->nlog; +  log.dev = dev; +  recover_from_log(); +} + +// Copy committed blocks from log to their home location +static void +install_trans(void) +{ +  int tail; + +  for (tail = 0; tail < log.lh.n; tail++) { +    struct buf *lbuf = bread(log.dev, log.start+tail+1); // read log block +    struct buf *dbuf = bread(log.dev, log.lh.block[tail]); // read dst +    memmove(dbuf->data, lbuf->data, BSIZE);  // copy block to dst +    bwrite(dbuf);  // write dst to disk +    bunpin(dbuf); +    brelse(lbuf); +    brelse(dbuf); +  } +} + +// Read the log header from disk into the in-memory log header +static void +read_head(void) +{ +  struct buf *buf = bread(log.dev, log.start); +  struct logheader *lh = (struct logheader *) (buf->data); +  int i; +  log.lh.n = lh->n; +  for (i = 0; i < log.lh.n; i++) { +    log.lh.block[i] = lh->block[i]; +  } +  brelse(buf); +} + +// Write in-memory log header to disk. +// This is the true point at which the +// current transaction commits. +static void +write_head(void) +{ +  struct buf *buf = bread(log.dev, log.start); +  struct logheader *hb = (struct logheader *) (buf->data); +  int i; +  hb->n = log.lh.n; +  for (i = 0; i < log.lh.n; i++) { +    hb->block[i] = log.lh.block[i]; +  } +  bwrite(buf); +  brelse(buf); +} + +static void +recover_from_log(void) +{ +  read_head(); +  install_trans(); // if committed, copy from log to disk +  log.lh.n = 0; +  write_head(); // clear the log +} + +// called at the start of each FS system call. 
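+// Sleeps until the log is not committing and there is enough free log
+// space to reserve MAXOPBLOCKS blocks for this call.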
+void +begin_op(void) +{ +  acquire(&log.lock); +  while(1){ +    if(log.committing){ +      sleep(&log, &log.lock); +    } else if(log.lh.n + (log.outstanding+1)*MAXOPBLOCKS > LOGSIZE){ +      // this op might exhaust log space; wait for commit. +      sleep(&log, &log.lock); +    } else { +      log.outstanding += 1; +      release(&log.lock); +      break; +    } +  } +} + +// called at the end of each FS system call. +// commits if this was the last outstanding operation. +void +end_op(void) +{ +  int do_commit = 0; + +  acquire(&log.lock); +  log.outstanding -= 1; +  if(log.committing) +    panic("log.committing"); +  if(log.outstanding == 0){ +    do_commit = 1; +    log.committing = 1; +  } else { +    // begin_op() may be waiting for log space, +    // and decrementing log.outstanding has decreased +    // the amount of reserved space. +    wakeup(&log); +  } +  release(&log.lock); + +  if(do_commit){ +    // call commit w/o holding locks, since not allowed +    // to sleep with locks. +    commit(); +    acquire(&log.lock); +    log.committing = 0; +    wakeup(&log); +    release(&log.lock); +  } +} + +// Copy modified blocks from cache to log. +static void +write_log(void) +{ +  int tail; + +  for (tail = 0; tail < log.lh.n; tail++) { +    struct buf *to = bread(log.dev, log.start+tail+1); // log block +    struct buf *from = bread(log.dev, log.lh.block[tail]); // cache block +    memmove(to->data, from->data, BSIZE); +    bwrite(to);  // write the log +    brelse(from); +    brelse(to); +  } +} + +static void +commit() +{ +  if (log.lh.n > 0) { +    write_log();     // Write modified blocks from cache to log +    write_head();    // Write header to disk -- the real commit +    install_trans(); // Now install writes to home locations +    log.lh.n = 0; +    write_head();    // Erase the transaction from the log +  } +} + +// Caller has modified b->data and is done with the buffer. +// Record the block number and pin in the cache by increasing refcnt. +// commit()/write_log() will do the disk write. +// +// log_write() replaces bwrite(); a typical use is: +//   bp = bread(...) +//   modify bp->data[] +//   log_write(bp) +//   brelse(bp) +void +log_write(struct buf *b) +{ +  int i; + +  if (log.lh.n >= LOGSIZE || log.lh.n >= log.size - 1) +    panic("too big a transaction"); +  if (log.outstanding < 1) +    panic("log_write outside of trans"); + +  acquire(&log.lock); +  for (i = 0; i < log.lh.n; i++) { +    if (log.lh.block[i] == b->blockno)   // log absorbtion +      break; +  } +  log.lh.block[i] = b->blockno; +  if (i == log.lh.n) {  // Add new block to log? +    bpin(b); +    log.lh.n++; +  } +  release(&log.lock); +} + diff --git a/kernel/main.c b/kernel/main.c new file mode 100644 index 0000000..a936fd3 --- /dev/null +++ b/kernel/main.c @@ -0,0 +1,43 @@ +#include "types.h" +#include "param.h" +#include "memlayout.h" +#include "riscv.h" +#include "defs.h" + +volatile static int started = 0; + +// start() jumps here in supervisor mode on all CPUs. 
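+// CPU 0 does the one-time initialization, then publishes started;
+// the other harts spin until they see it and do only per-hart setup.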
+void +main() +{ +  if(cpuid() == 0){ +    consoleinit(); +    printfinit(); +    printf("hart %d starting\n", cpuid()); +    kinit();         // physical page allocator +    kvminit();       // create kernel page table +    kvminithart();   // turn on paging +    procinit();      // process table +    trapinit();      // trap vectors +    trapinithart();  // install kernel trap vector +    plicinit();      // set up interrupt controller +    plicinithart();  // ask PLIC for device interrupts +    binit();         // buffer cache +    iinit();         // inode cache +    fileinit();      // file table +    virtio_disk_init(); // emulated hard disk +    userinit();      // first user process +    __sync_synchronize(); +    started = 1; +  } else { +    while(started == 0) +      ; +    __sync_synchronize(); +    printf("hart %d starting\n", cpuid()); +    kvminithart();    // turn on paging +    trapinithart();   // install kernel trap vector +    plicinithart();   // ask PLIC for device interrupts +  } + +  scheduler();         +} diff --git a/kernel/memlayout.h b/kernel/memlayout.h new file mode 100644 index 0000000..8ffd538 --- /dev/null +++ b/kernel/memlayout.h @@ -0,0 +1,67 @@ +// Physical memory layout + +// qemu -machine virt is set up like this, +// based on qemu's hw/riscv/virt.c: +// +// 00001000 -- boot ROM, provided by qemu +// 02000000 -- CLINT +// 0C000000 -- PLIC +// 10000000 -- uart0  +// 10001000 -- virtio disk  +// 80000000 -- boot ROM jumps here in machine mode +//             -kernel loads the kernel here +// unused RAM after 80000000. + +// the kernel uses physical memory thus: +// 80000000 -- entry.S, then kernel text and data +// end -- start of kernel page allocation area +// PHYSTOP -- end RAM used by the kernel + +// qemu puts UART registers here in physical memory. +#define UART0 0x10000000L +#define UART0_IRQ 10 + +// virtio mmio interface +#define VIRTIO0 0x10001000 +#define VIRTIO0_IRQ 1 + +// local interrupt controller, which contains the timer. +#define CLINT 0x2000000L +#define CLINT_MTIMECMP(hartid) (CLINT + 0x4000 + 8*(hartid)) +#define CLINT_MTIME (CLINT + 0xBFF8) // cycles since boot. + +// qemu puts programmable interrupt controller here. +#define PLIC 0x0c000000L +#define PLIC_PRIORITY (PLIC + 0x0) +#define PLIC_PENDING (PLIC + 0x1000) +#define PLIC_MENABLE(hart) (PLIC + 0x2000 + (hart)*0x100) +#define PLIC_SENABLE(hart) (PLIC + 0x2080 + (hart)*0x100) +#define PLIC_MPRIORITY(hart) (PLIC + 0x200000 + (hart)*0x2000) +#define PLIC_SPRIORITY(hart) (PLIC + 0x201000 + (hart)*0x2000) +#define PLIC_MCLAIM(hart) (PLIC + 0x200004 + (hart)*0x2000) +#define PLIC_SCLAIM(hart) (PLIC + 0x201004 + (hart)*0x2000) + +// the kernel expects there to be RAM +// for use by the kernel and user pages +// from physical address 0x80000000 to PHYSTOP. +#define KERNBASE 0x80000000L +#define PHYSTOP (KERNBASE + 128*1024*1024) + +// map the trampoline page to the highest address, +// in both user and kernel space. +#define TRAMPOLINE (MAXVA - PGSIZE) + +// map kernel stacks beneath the trampoline, +// each surrounded by invalid guard pages. +#define KSTACK(p) (TRAMPOLINE - ((p)+1)* 2*PGSIZE) + +// User memory layout. +// Address zero first: +//   text +//   original data and bss +//   fixed-size stack +//   expandable heap +//   ... 
+//   TRAPFRAME (p->tf, used by the trampoline) +//   TRAMPOLINE (the same page as in the kernel) +#define TRAPFRAME (TRAMPOLINE - PGSIZE) diff --git a/kernel/param.h b/kernel/param.h new file mode 100644 index 0000000..b5fdcb2 --- /dev/null +++ b/kernel/param.h @@ -0,0 +1,13 @@ +#define NPROC        64  // maximum number of processes +#define NCPU          8  // maximum number of CPUs +#define NOFILE       16  // open files per process +#define NFILE       100  // open files per system +#define NINODE       50  // maximum number of active i-nodes +#define NDEV         10  // maximum major device number +#define ROOTDEV       1  // device number of file system root disk +#define MAXARG       32  // max exec arguments +#define MAXOPBLOCKS  10  // max # of blocks any FS op writes +#define LOGSIZE      (MAXOPBLOCKS*3)  // max data blocks in on-disk log +#define NBUF         (MAXOPBLOCKS*3)  // size of disk block cache +#define FSSIZE       1000  // size of file system in blocks +#define MAXPATH      128   // maximum file path name diff --git a/kernel/pipe.c b/kernel/pipe.c new file mode 100644 index 0000000..c3a8acf --- /dev/null +++ b/kernel/pipe.c @@ -0,0 +1,127 @@ +#include "types.h" +#include "riscv.h" +#include "defs.h" +#include "param.h" +#include "spinlock.h" +#include "proc.h" +#include "fs.h" +#include "sleeplock.h" +#include "file.h" + +#define PIPESIZE 512 + +struct pipe { +  struct spinlock lock; +  char data[PIPESIZE]; +  uint nread;     // number of bytes read +  uint nwrite;    // number of bytes written +  int readopen;   // read fd is still open +  int writeopen;  // write fd is still open +}; + +int +pipealloc(struct file **f0, struct file **f1) +{ +  struct pipe *pi; + +  pi = 0; +  *f0 = *f1 = 0; +  if((*f0 = filealloc()) == 0 || (*f1 = filealloc()) == 0) +    goto bad; +  if((pi = (struct pipe*)kalloc()) == 0) +    goto bad; +  pi->readopen = 1; +  pi->writeopen = 1; +  pi->nwrite = 0; +  pi->nread = 0; +  initlock(&pi->lock, "pipe"); +  (*f0)->type = FD_PIPE; +  (*f0)->readable = 1; +  (*f0)->writable = 0; +  (*f0)->pipe = pi; +  (*f1)->type = FD_PIPE; +  (*f1)->readable = 0; +  (*f1)->writable = 1; +  (*f1)->pipe = pi; +  return 0; + + bad: +  if(pi) +    kfree((char*)pi); +  if(*f0) +    fileclose(*f0); +  if(*f1) +    fileclose(*f1); +  return -1; +} + +void +pipeclose(struct pipe *pi, int writable) +{ +  acquire(&pi->lock); +  if(writable){ +    pi->writeopen = 0; +    wakeup(&pi->nread); +  } else { +    pi->readopen = 0; +    wakeup(&pi->nwrite); +  } +  if(pi->readopen == 0 && pi->writeopen == 0){ +    release(&pi->lock); +    kfree((char*)pi); +  } else +    release(&pi->lock); +} + +int +pipewrite(struct pipe *pi, uint64 addr, int n) +{ +  int i; +  char ch; +  struct proc *pr = myproc(); + +  acquire(&pi->lock); +  for(i = 0; i < n; i++){ +    while(pi->nwrite == pi->nread + PIPESIZE){  //DOC: pipewrite-full +      if(pi->readopen == 0 || myproc()->killed){ +        release(&pi->lock); +        return -1; +      } +      wakeup(&pi->nread); +      sleep(&pi->nwrite, &pi->lock);  //DOC: pipewrite-sleep +    } +    if(copyin(pr->pagetable, &ch, addr + i, 1) == -1) +      break; +    pi->data[pi->nwrite++ % PIPESIZE] = ch; +  } +  wakeup(&pi->nread);  //DOC: pipewrite-wakeup1 +  release(&pi->lock); +  return n; +} + +int +piperead(struct pipe *pi, uint64 addr, int n) +{ +  int i; +  struct proc *pr = myproc(); +  char ch; + +  acquire(&pi->lock); +  while(pi->nread == pi->nwrite && pi->writeopen){  //DOC: pipe-empty +    if(myproc()->killed){ +      
release(&pi->lock); +      return -1; +    } +    sleep(&pi->nread, &pi->lock); //DOC: piperead-sleep +  } +  for(i = 0; i < n; i++){  //DOC: piperead-copy +    if(pi->nread == pi->nwrite) +      break; +    ch = pi->data[pi->nread++ % PIPESIZE]; +    if(copyout(pr->pagetable, addr + i, &ch, 1) == -1) +      break; +  } +  wakeup(&pi->nwrite);  //DOC: piperead-wakeup +  release(&pi->lock); +  return i; +} diff --git a/kernel/plic.c b/kernel/plic.c new file mode 100644 index 0000000..b569492 --- /dev/null +++ b/kernel/plic.c @@ -0,0 +1,62 @@ +#include "types.h" +#include "param.h" +#include "memlayout.h" +#include "riscv.h" +#include "defs.h" + +// +// the riscv Platform Level Interrupt Controller (PLIC). +// + +void +plicinit(void) +{ +  // set desired IRQ priorities non-zero (otherwise disabled). +  *(uint32*)(PLIC + UART0_IRQ*4) = 1; +  *(uint32*)(PLIC + VIRTIO0_IRQ*4) = 1; +} + +void +plicinithart(void) +{ +  int hart = cpuid(); +   +  // set uart's enable bit for this hart's S-mode.  +  *(uint32*)PLIC_SENABLE(hart)= (1 << UART0_IRQ) | (1 << VIRTIO0_IRQ); + +  // set this hart's S-mode priority threshold to 0. +  *(uint32*)PLIC_SPRIORITY(hart) = 0; +} + +// return a bitmap of which IRQs are waiting +// to be served. +uint64 +plic_pending(void) +{ +  uint64 mask; + +  //mask = *(uint32*)(PLIC + 0x1000); +  //mask |= (uint64)*(uint32*)(PLIC + 0x1004) << 32; +  mask = *(uint64*)PLIC_PENDING; + +  return mask; +} + +// ask the PLIC what interrupt we should serve. +int +plic_claim(void) +{ +  int hart = cpuid(); +  //int irq = *(uint32*)(PLIC + 0x201004); +  int irq = *(uint32*)PLIC_SCLAIM(hart); +  return irq; +} + +// tell the PLIC we've served this IRQ. +void +plic_complete(int irq) +{ +  int hart = cpuid(); +  //*(uint32*)(PLIC + 0x201004) = irq; +  *(uint32*)PLIC_SCLAIM(hart) = irq; +} diff --git a/kernel/printf.c b/kernel/printf.c new file mode 100644 index 0000000..777cc5f --- /dev/null +++ b/kernel/printf.c @@ -0,0 +1,134 @@ +// +// formatted console output -- printf, panic. +// + +#include <stdarg.h> + +#include "types.h" +#include "param.h" +#include "spinlock.h" +#include "sleeplock.h" +#include "fs.h" +#include "file.h" +#include "memlayout.h" +#include "riscv.h" +#include "defs.h" +#include "proc.h" + +volatile int panicked = 0; + +// lock to avoid interleaving concurrent printf's. +static struct { +  struct spinlock lock; +  int locking; +} pr; + +static char digits[] = "0123456789abcdef"; + +static void +printint(int xx, int base, int sign) +{ +  char buf[16]; +  int i; +  uint x; + +  if(sign && (sign = xx < 0)) +    x = -xx; +  else +    x = xx; + +  i = 0; +  do { +    buf[i++] = digits[x % base]; +  } while((x /= base) != 0); + +  if(sign) +    buf[i++] = '-'; + +  while(--i >= 0) +    consputc(buf[i]); +} + +static void +printptr(uint64 x) +{ +  int i; +  consputc('0'); +  consputc('x'); +  for (i = 0; i < (sizeof(uint64) * 2); i++, x <<= 4) +    consputc(digits[x >> (sizeof(uint64) * 8 - 4)]); +} + +// Print to the console. only understands %d, %x, %p, %s. +void +printf(char *fmt, ...) 
+{ +  va_list ap; +  int i, c, locking; +  char *s; + +  locking = pr.locking; +  if(locking) +    acquire(&pr.lock); + +  if (fmt == 0) +    panic("null fmt"); + +  va_start(ap, fmt); +  for(i = 0; (c = fmt[i] & 0xff) != 0; i++){ +    if(c != '%'){ +      consputc(c); +      continue; +    } +    c = fmt[++i] & 0xff; +    if(c == 0) +      break; +    switch(c){ +    case 'd': +      printint(va_arg(ap, int), 10, 1); +      break; +    case 'x': +      printint(va_arg(ap, int), 16, 1); +      break; +    case 'p': +      printptr(va_arg(ap, uint64)); +      break; +    case 's': +      if((s = va_arg(ap, char*)) == 0) +        s = "(null)"; +      for(; *s; s++) +        consputc(*s); +      break; +    case '%': +      consputc('%'); +      break; +    default: +      // Print unknown % sequence to draw attention. +      consputc('%'); +      consputc(c); +      break; +    } +  } + +  if(locking) +    release(&pr.lock); +} + +void +panic(char *s) +{ +  pr.locking = 0; +  printf("panic: "); +  printf(s); +  printf("\n"); +  panicked = 1; // freeze other CPUs +  for(;;) +    ; +} + +void +printfinit(void) +{ +  initlock(&pr.lock, "pr"); +  pr.locking = 1; +} diff --git a/kernel/proc.c b/kernel/proc.c new file mode 100644 index 0000000..3d65b46 --- /dev/null +++ b/kernel/proc.c @@ -0,0 +1,647 @@ +#include "types.h" +#include "param.h" +#include "memlayout.h" +#include "riscv.h" +#include "spinlock.h" +#include "proc.h" +#include "defs.h" + +struct cpu cpus[NCPU]; + +struct proc proc[NPROC]; + +struct proc *initproc; + +int nextpid = 1; +struct spinlock pid_lock; + +extern void forkret(void); +static void wakeup1(struct proc *chan); + +extern char trampoline[]; // trampoline.S + +void +procinit(void) +{ +  struct proc *p; +   +  initlock(&pid_lock, "nextpid"); +  for(p = proc; p < &proc[NPROC]; p++) { +      initlock(&p->lock, "proc"); + +      // Allocate a page for the process's kernel stack. +      // Map it high in memory, followed by an invalid +      // guard page. +      char *pa = kalloc(); +      if(pa == 0) +        panic("kalloc"); +      uint64 va = KSTACK((int) (p - proc)); +      kvmmap(va, (uint64)pa, PGSIZE, PTE_R | PTE_W); +      p->kstack = va; +  } +  kvminithart(); +} + +// Must be called with interrupts disabled, +// to prevent race with process being moved +// to a different CPU. +int +cpuid() +{ +  int id = r_tp(); +  return id; +} + +// Return this CPU's cpu struct. +// Interrupts must be disabled. +struct cpu* +mycpu(void) { +  int id = cpuid(); +  struct cpu *c = &cpus[id]; +  return c; +} + +// Return the current struct proc *, or zero if none. +struct proc* +myproc(void) { +  push_off(); +  struct cpu *c = mycpu(); +  struct proc *p = c->proc; +  pop_off(); +  return p; +} + +int +allocpid() { +  int pid; +   +  acquire(&pid_lock); +  pid = nextpid; +  nextpid = nextpid + 1; +  release(&pid_lock); + +  return pid; +} + +// Look in the process table for an UNUSED proc. +// If found, initialize state required to run in the kernel, +// and return with p->lock held. +// If there are no free procs, return 0. +static struct proc* +allocproc(void) +{ +  struct proc *p; + +  for(p = proc; p < &proc[NPROC]; p++) { +    acquire(&p->lock); +    if(p->state == UNUSED) { +      goto found; +    } else { +      release(&p->lock); +    } +  } +  return 0; + +found: +  p->pid = allocpid(); + +  // Allocate a trapframe page. +  if((p->tf = (struct trapframe *)kalloc()) == 0){ +    release(&p->lock); +    return 0; +  } + +  // An empty user page table. 
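mycpu()'s comment above requires interrupts to be off so the thread cannot migrate to another hart between reading tp and using the result; myproc() shows the canonical bracketing with push_off()/pop_off(). A sketch of the same idiom for other per-CPU state follows; current_name() is a hypothetical helper, not part of this tree.

// Hypothetical example of the push_off()/pop_off() idiom around mycpu().
static void
current_name(char *buf, int n)
{
  struct proc *p;

  push_off();          // interrupts off: we cannot be moved to another CPU
  p = mycpu()->proc;   // per-CPU pointer is stable while interrupts are off
  pop_off();
  if(p)
    safestrcpy(buf, p->name, n);   // p is the current process, so it stays valid
}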
+  p->pagetable = proc_pagetable(p); + +  // Set up new context to start executing at forkret, +  // which returns to user space. +  memset(&p->context, 0, sizeof p->context); +  p->context.ra = (uint64)forkret; +  p->context.sp = p->kstack + PGSIZE; + +  return p; +} + +// free a proc structure and the data hanging from it, +// including user pages. +// p->lock must be held. +static void +freeproc(struct proc *p) +{ +  if(p->tf) +    kfree((void*)p->tf); +  p->tf = 0; +  if(p->pagetable) +    proc_freepagetable(p->pagetable, p->sz); +  p->pagetable = 0; +  p->sz = 0; +  p->pid = 0; +  p->parent = 0; +  p->name[0] = 0; +  p->chan = 0; +  p->killed = 0; +  p->state = UNUSED; +} + +// Create a page table for a given process, +// with no user pages, but with trampoline pages. +pagetable_t +proc_pagetable(struct proc *p) +{ +  pagetable_t pagetable; + +  // An empty page table. +  pagetable = uvmcreate(); + +  // map the trampoline code (for system call return) +  // at the highest user virtual address. +  // only the supervisor uses it, on the way +  // to/from user space, so not PTE_U. +  mappages(pagetable, TRAMPOLINE, PGSIZE, +           (uint64)trampoline, PTE_R | PTE_X); + +  // map the trapframe just below TRAMPOLINE, for trampoline.S. +  mappages(pagetable, TRAPFRAME, PGSIZE, +           (uint64)(p->tf), PTE_R | PTE_W); + +  return pagetable; +} + +// Free a process's page table, and free the +// physical memory it refers to. +void +proc_freepagetable(pagetable_t pagetable, uint64 sz) +{ +  uvmunmap(pagetable, TRAMPOLINE, PGSIZE, 0); +  uvmunmap(pagetable, TRAPFRAME, PGSIZE, 0); +  if(sz > 0) +    uvmfree(pagetable, sz); +} + +// a user program that calls exec("/init") +// od -t xC initcode +uchar initcode[] = { +  0x17, 0x05, 0x00, 0x00, 0x13, 0x05, 0x05, 0x02, +  0x97, 0x05, 0x00, 0x00, 0x93, 0x85, 0x05, 0x02, +  0x9d, 0x48, 0x73, 0x00, 0x00, 0x00, 0x89, 0x48, +  0x73, 0x00, 0x00, 0x00, 0xef, 0xf0, 0xbf, 0xff, +  0x2f, 0x69, 0x6e, 0x69, 0x74, 0x00, 0x00, 0x01, +  0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +  0x00, 0x00, 0x00 +}; + +// Set up first user process. +void +userinit(void) +{ +  struct proc *p; + +  p = allocproc(); +  initproc = p; +   +  // allocate one user page and copy init's instructions +  // and data into it. +  uvminit(p->pagetable, initcode, sizeof(initcode)); +  p->sz = PGSIZE; + +  // prepare for the very first "return" from kernel to user. +  p->tf->epc = 0;      // user program counter +  p->tf->sp = PGSIZE;  // user stack pointer + +  safestrcpy(p->name, "initcode", sizeof(p->name)); +  p->cwd = namei("/"); + +  p->state = RUNNABLE; + +  release(&p->lock); +} + +// Grow or shrink user memory by n bytes. +// Return 0 on success, -1 on failure. +int +growproc(int n) +{ +  uint sz; +  struct proc *p = myproc(); + +  sz = p->sz; +  if(n > 0){ +    if((sz = uvmalloc(p->pagetable, sz, sz + n)) == 0) { +      return -1; +    } +  } else if(n < 0){ +    if((sz = uvmdealloc(p->pagetable, sz, sz + n)) == 0) { +      return -1; +    } +  } +  p->sz = sz; +  return 0; +} + +// Create a new process, copying the parent. +// Sets up child kernel stack to return as if from fork() system call. +int +fork(void) +{ +  int i, pid; +  struct proc *np; +  struct proc *p = myproc(); + +  // Allocate process. +  if((np = allocproc()) == 0){ +    return -1; +  } + +  // Copy user memory from parent to child. 
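growproc() above is the kernel half of the sbrk system call (sys_sbrk appears later in this diff, in sysproc.c). A hedged user-space sketch of that path, assuming the project's usual user-level sbrk() wrapper, which is not part of this diff:

// User-space sketch: grow the heap by one page via sbrk()/growproc().
// The sbrk() wrapper is assumed from the project's user library.
int
grow_heap(void)
{
  char *old = sbrk(4096);    // returns the old p->sz; the kernel calls uvmalloc()
  if(old == (char*)-1)
    return -1;               // growproc() failed
  old[0] = 'x';              // the newly mapped page is now usable
  return 0;
}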
+  if(uvmcopy(p->pagetable, np->pagetable, p->sz) < 0){ +    freeproc(np); +    release(&np->lock); +    return -1; +  } +  np->sz = p->sz; + +  np->parent = p; + +  // copy saved user registers. +  *(np->tf) = *(p->tf); + +  // Cause fork to return 0 in the child. +  np->tf->a0 = 0; + +  // increment reference counts on open file descriptors. +  for(i = 0; i < NOFILE; i++) +    if(p->ofile[i]) +      np->ofile[i] = filedup(p->ofile[i]); +  np->cwd = idup(p->cwd); + +  safestrcpy(np->name, p->name, sizeof(p->name)); + +  pid = np->pid; + +  np->state = RUNNABLE; + +  release(&np->lock); + +  return pid; +} + +// Pass p's abandoned children to init. +// Caller must hold p->lock and parent->lock. +void +reparent(struct proc *p, struct proc *parent) { +  struct proc *pp; +  int child_of_init = (p->parent == initproc); + +  for(pp = proc; pp < &proc[NPROC]; pp++){ +    // this code uses pp->parent without holding pp->lock. +    // acquiring the lock first could cause a deadlock +    // if pp or a child of pp were also in exit() +    // and about to try to lock p. +    if(pp->parent == p){ +      // pp->parent can't change between the check and the acquire() +      // because only the parent changes it, and we're the parent. +      acquire(&pp->lock); +      pp->parent = initproc; +      if(pp->state == ZOMBIE) { +        if(!child_of_init) +          acquire(&initproc->lock); +        wakeup1(initproc); +        if(!child_of_init) +          release(&initproc->lock); +      } +      release(&pp->lock); +    } +  } +} + +// Exit the current process.  Does not return. +// An exited process remains in the zombie state +// until its parent calls wait(). +void +exit(void) +{ +  struct proc *p = myproc(); + +  if(p == initproc) +    panic("init exiting"); + +  // Close all open files. +  for(int fd = 0; fd < NOFILE; fd++){ +    if(p->ofile[fd]){ +      struct file *f = p->ofile[fd]; +      fileclose(f); +      p->ofile[fd] = 0; +    } +  } + +  begin_op(); +  iput(p->cwd); +  end_op(); +  p->cwd = 0; + +  acquire(&p->parent->lock); +     +  acquire(&p->lock); + +  // Give any children to init. +  reparent(p, p->parent); + +  // Parent might be sleeping in wait(). +  wakeup1(p->parent); + +  p->state = ZOMBIE; + +  release(&p->parent->lock); + +  // Jump into the scheduler, never to return. +  sched(); +  panic("zombie exit"); +} + +// Wait for a child process to exit and return its pid. +// Return -1 if this process has no children. +int +wait(void) +{ +  struct proc *np; +  int havekids, pid; +  struct proc *p = myproc(); + +  // hold p->lock for the whole time to avoid lost +  // wakeups from a child's exit(). +  acquire(&p->lock); + +  for(;;){ +    // Scan through table looking for exited children. +    havekids = 0; +    for(np = proc; np < &proc[NPROC]; np++){ +      // this code uses np->parent without holding np->lock. +      // acquiring the lock first would cause a deadlock, +      // since np might be an ancestor, and we already hold p->lock. +      if(np->parent == p){ +        // np->parent can't change between the check and the acquire() +        // because only the parent changes it, and we're the parent. +        acquire(&np->lock); +        havekids = 1; +        if(np->state == ZOMBIE){ +          // Found one. +          pid = np->pid; +          freeproc(np); +          release(&np->lock); +          release(&p->lock); +          return pid; +        } +        release(&np->lock); +      } +    } + +    // No point waiting if we don't have any children. 
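fork(), exit(), and the wait() loop here form one contract: fork() returns twice, exit() leaves a ZOMBIE for the parent, and wait() reaps it. A user-space sketch of that contract, assuming user-level wrappers with the same no-argument exit()/wait() signatures as sys_exit/sys_wait later in this diff:

// User-space sketch of the fork/exit/wait contract implemented in proc.c.
int
spawn_and_reap(void)
{
  int pid = fork();
  if(pid < 0)
    return -1;        // allocproc() or uvmcopy() failed
  if(pid == 0){
    // child: fork() returned 0 because np->tf->a0 was set to 0
    exit();
  }
  return wait();      // parent: blocks until the child is a ZOMBIE, returns its pid
}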
+    if(!havekids || p->killed){ +      release(&p->lock); +      return -1; +    } +     +    // Wait for a child to exit. +    sleep(p, &p->lock);  //DOC: wait-sleep +  } +} + +// Per-CPU process scheduler. +// Each CPU calls scheduler() after setting itself up. +// Scheduler never returns.  It loops, doing: +//  - choose a process to run. +//  - swtch to start running that process. +//  - eventually that process transfers control +//    via swtch back to the scheduler. +void +scheduler(void) +{ +  struct proc *p; +  struct cpu *c = mycpu(); +   +  c->proc = 0; +  for(;;){ +    // Avoid deadlock by ensuring that devices can interrupt. +    intr_on(); + +    for(p = proc; p < &proc[NPROC]; p++) { +      acquire(&p->lock); +      if(p->state == RUNNABLE) { +        // Switch to chosen process.  It is the process's job +        // to release its lock and then reacquire it +        // before jumping back to us. +        p->state = RUNNING; +        c->proc = p; +        swtch(&c->scheduler, &p->context); + +        // Process is done running for now. +        // It should have changed its p->state before coming back. +        c->proc = 0; +      } +      release(&p->lock); +    } +  } +} + +// Switch to scheduler.  Must hold only p->lock +// and have changed proc->state. Saves and restores +// intena because intena is a property of this +// kernel thread, not this CPU. It should +// be proc->intena and proc->noff, but that would +// break in the few places where a lock is held but +// there's no process. +void +sched(void) +{ +  int intena; +  struct proc *p = myproc(); + +  if(!holding(&p->lock)) +    panic("sched p->lock"); +  if(mycpu()->noff != 1) +    panic("sched locks"); +  if(p->state == RUNNING) +    panic("sched running"); +  if(intr_get()) +    panic("sched interruptible"); + +  intena = mycpu()->intena; +  swtch(&p->context, &mycpu()->scheduler); +  mycpu()->intena = intena; +} + +// Give up the CPU for one scheduling round. +void +yield(void) +{ +  struct proc *p = myproc(); +  acquire(&p->lock);  //DOC: yieldlock +  p->state = RUNNABLE; +  sched(); +  release(&p->lock); +} + +// A fork child's very first scheduling by scheduler() +// will swtch to forkret. +void +forkret(void) +{ +  static int first = 1; + +  // Still holding p->lock from scheduler. +  release(&myproc()->lock); + +  if (first) { +    // File system initialization must be run in the context of a +    // regular process (e.g., because it calls sleep), and thus cannot +    // be run from main(). +    first = 0; +    fsinit(ROOTDEV); +  } + +  usertrapret(); +} + +// Atomically release lock and sleep on chan. +// Reacquires lock when awakened. +void +sleep(void *chan, struct spinlock *lk) +{ +  struct proc *p = myproc(); +   +  // Must acquire p->lock in order to +  // change p->state and then call sched. +  // Once we hold p->lock, we can be +  // guaranteed that we won't miss any wakeup +  // (wakeup locks p->lock), +  // so it's okay to release lk. +  if(lk != &p->lock){  //DOC: sleeplock0 +    acquire(&p->lock);  //DOC: sleeplock1 +    release(lk); +  } + +  // Go to sleep. +  p->chan = chan; +  p->state = SLEEPING; + +  sched(); + +  // Tidy up. +  p->chan = 0; + +  // Reacquire original lock. +  if(lk != &p->lock){  //DOC: sleeplock2 +    release(&p->lock); +    acquire(lk); +  } +} + +// Wake up all processes sleeping on chan. +// Must be called without any p->lock. 
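The comments on sleep() above and wakeup() just below describe a convention rather than enforce it: the sleeper must hold the lock that protects the condition, and the waker must change the condition under that same lock before calling wakeup(). A minimal sketch of that convention; done, donelock, waiter(), and signaller() are hypothetical names.

// Sketch of the sleep()/wakeup() convention.  donelock must be initialized
// with initlock() at startup; these names are for illustration only.
struct spinlock donelock;
int done;

void
waiter(void)
{
  acquire(&donelock);
  while(!done)                  // recheck: wakeup() wakes every sleeper on the channel
    sleep(&done, &donelock);    // releases donelock while asleep, reacquires on wakeup
  release(&donelock);
}

void
signaller(void)
{
  acquire(&donelock);
  done = 1;
  wakeup(&done);                // holds donelock (not a p->lock), so no lost wakeup
  release(&donelock);
}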
+void +wakeup(void *chan) +{ +  struct proc *p; + +  for(p = proc; p < &proc[NPROC]; p++) { +    acquire(&p->lock); +    if(p->state == SLEEPING && p->chan == chan) { +      p->state = RUNNABLE; +    } +    release(&p->lock); +  } +} + +// Wake up p if it is sleeping in wait(); used by exit(). +// Caller must hold p->lock. +static void +wakeup1(struct proc *p) +{ +  if(p->chan == p && p->state == SLEEPING) { +    p->state = RUNNABLE; +  } +} + +// Kill the process with the given pid. +// The victim won't exit until it tries to return +// to user space (see usertrap() in trap.c). +int +kill(int pid) +{ +  struct proc *p; + +  for(p = proc; p < &proc[NPROC]; p++){ +    acquire(&p->lock); +    if(p->pid == pid){ +      p->killed = 1; +      if(p->state == SLEEPING){ +        // Wake process from sleep(). +        p->state = RUNNABLE; +      } +      release(&p->lock); +      return 0; +    } +    release(&p->lock); +  } +  return -1; +} + +// Copy to either a user address, or kernel address, +// depending on usr_dst. +// Returns 0 on success, -1 on error. +int +either_copyout(int user_dst, uint64 dst, void *src, uint64 len) +{ +  struct proc *p = myproc(); +  if(user_dst){ +    return copyout(p->pagetable, dst, src, len); +  } else { +    memmove((char *)dst, src, len); +    return 0; +  } +} + +// Copy from either a user address, or kernel address, +// depending on usr_src. +// Returns 0 on success, -1 on error. +int +either_copyin(void *dst, int user_src, uint64 src, uint64 len) +{ +  struct proc *p = myproc(); +  if(user_src){ +    return copyin(p->pagetable, dst, src, len); +  } else { +    memmove(dst, (char*)src, len); +    return 0; +  } +} + +// Print a process listing to console.  For debugging. +// Runs when user types ^P on console. +// No lock to avoid wedging a stuck machine further. +void +procdump(void) +{ +  static char *states[] = { +  [UNUSED]    "unused", +  [SLEEPING]  "sleep ", +  [RUNNABLE]  "runble", +  [RUNNING]   "run   ", +  [ZOMBIE]    "zombie" +  }; +  struct proc *p; +  char *state; + +  printf("\n"); +  for(p = proc; p < &proc[NPROC]; p++){ +    if(p->state == UNUSED) +      continue; +    if(p->state >= 0 && p->state < NELEM(states) && states[p->state]) +      state = states[p->state]; +    else +      state = "???"; +    printf("%d %s %s", p->pid, state, p->name); +    printf("\n"); +  } +} diff --git a/kernel/proc.h b/kernel/proc.h new file mode 100644 index 0000000..655d79f --- /dev/null +++ b/kernel/proc.h @@ -0,0 +1,105 @@ +// Saved registers for kernel context switches. +struct context { +  uint64 ra; +  uint64 sp; + +  // callee-saved +  uint64 s0; +  uint64 s1; +  uint64 s2; +  uint64 s3; +  uint64 s4; +  uint64 s5; +  uint64 s6; +  uint64 s7; +  uint64 s8; +  uint64 s9; +  uint64 s10; +  uint64 s11; +}; + +// Per-CPU state. +struct cpu { +  struct proc *proc;          // The process running on this cpu, or null. +  struct context scheduler;   // swtch() here to enter scheduler(). +  int noff;                   // Depth of push_off() nesting. +  int intena;                 // Were interrupts enabled before push_off()? +}; + +extern struct cpu cpus[NCPU]; + +// per-process data for the trap handling code in trampoline.S. +// sits in a page by itself just under the trampoline page in the +// user page table. not specially mapped in the kernel page table. +// the sscratch register points here. 
+// uservec in trampoline.S saves user registers in the trapframe, +// then initializes registers from the trapframe's +// kernel_sp, kernel_hartid, kernel_satp, and jumps to kernel_trap. +// usertrapret() and userret in trampoline.S set up +// the trapframe's kernel_*, restore user registers from the +// trapframe, switch to the user page table, and enter user space. +// the trapframe includes callee-saved user registers like s0-s11 because the +// return-to-user path via usertrapret() doesn't return through +// the entire kernel call stack. +struct trapframe { +  /*   0 */ uint64 kernel_satp;   // kernel page table +  /*   8 */ uint64 kernel_sp;     // top of process's kernel stack +  /*  16 */ uint64 kernel_trap;   // usertrap() +  /*  24 */ uint64 epc;           // saved user program counter +  /*  32 */ uint64 kernel_hartid; // saved kernel tp +  /*  40 */ uint64 ra; +  /*  48 */ uint64 sp; +  /*  56 */ uint64 gp; +  /*  64 */ uint64 tp; +  /*  72 */ uint64 t0; +  /*  80 */ uint64 t1; +  /*  88 */ uint64 t2; +  /*  96 */ uint64 s0; +  /* 104 */ uint64 s1; +  /* 112 */ uint64 a0; +  /* 120 */ uint64 a1; +  /* 128 */ uint64 a2; +  /* 136 */ uint64 a3; +  /* 144 */ uint64 a4; +  /* 152 */ uint64 a5; +  /* 160 */ uint64 a6; +  /* 168 */ uint64 a7; +  /* 176 */ uint64 s2; +  /* 184 */ uint64 s3; +  /* 192 */ uint64 s4; +  /* 200 */ uint64 s5; +  /* 208 */ uint64 s6; +  /* 216 */ uint64 s7; +  /* 224 */ uint64 s8; +  /* 232 */ uint64 s9; +  /* 240 */ uint64 s10; +  /* 248 */ uint64 s11; +  /* 256 */ uint64 t3; +  /* 264 */ uint64 t4; +  /* 272 */ uint64 t5; +  /* 280 */ uint64 t6; +}; + +enum procstate { UNUSED, SLEEPING, RUNNABLE, RUNNING, ZOMBIE }; + +// Per-process state +struct proc { +  struct spinlock lock; + +  // p->lock must be held when using these: +  enum procstate state;        // Process state +  struct proc *parent;         // Parent process +  void *chan;                  // If non-zero, sleeping on chan +  int killed;                  // If non-zero, have been killed +  int pid;                     // Process ID + +  // these are private to the process, so p->lock need not be held. +  uint64 kstack;               // Bottom of kernel stack for this process +  uint64 sz;                   // Size of process memory (bytes) +  pagetable_t pagetable;       // Page table +  struct trapframe *tf;        // data page for trampoline.S +  struct context context;      // swtch() here to run process +  struct file *ofile[NOFILE];  // Open files +  struct inode *cwd;           // Current directory +  char name[16];               // Process name (debugging) +}; diff --git a/kernel/ramdisk.c b/kernel/ramdisk.c new file mode 100644 index 0000000..9901294 --- /dev/null +++ b/kernel/ramdisk.c @@ -0,0 +1,45 @@ +// +// ramdisk that uses the disk image loaded by qemu -rdinit fs.img +// + +#include "types.h" +#include "riscv.h" +#include "defs.h" +#include "param.h" +#include "memlayout.h" +#include "spinlock.h" +#include "sleeplock.h" +#include "fs.h" +#include "buf.h" + +void +ramdiskinit(void) +{ +} + +// If B_DIRTY is set, write buf to disk, clear B_DIRTY, set B_VALID. +// Else if B_VALID is not set, read buf from disk, set B_VALID. 
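The numeric comments in struct trapframe above have to agree with the hard-coded offsets in trampoline.S (for example "sd ra, 40(a0)"). A hedged sketch of how those offsets could be checked at compile time; _Static_assert and <stddef.h> are not used anywhere in this tree, so this is illustration only.

// Illustrative compile-time check that trapframe offsets match trampoline.S.
#include <stddef.h>   // offsetof; not part of this kernel's build

_Static_assert(offsetof(struct trapframe, kernel_satp) == 0,   "kernel_satp");
_Static_assert(offsetof(struct trapframe, kernel_sp)   == 8,   "kernel_sp");
_Static_assert(offsetof(struct trapframe, epc)         == 24,  "epc");
_Static_assert(offsetof(struct trapframe, ra)          == 40,  "ra");
_Static_assert(offsetof(struct trapframe, a0)          == 112, "a0");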
+void +ramdiskrw(struct buf *b) +{ +  if(!holdingsleep(&b->lock)) +    panic("ramdiskrw: buf not locked"); +  if((b->flags & (B_VALID|B_DIRTY)) == B_VALID) +    panic("ramdiskrw: nothing to do"); + +  if(b->blockno >= FSSIZE) +    panic("ramdiskrw: blockno too big"); + +  uint64 diskaddr = b->blockno * BSIZE; +  char *addr = (char *)RAMDISK + diskaddr; + +  if(b->flags & B_DIRTY){ +    // write +    memmove(addr, b->data, BSIZE); +    b->flags &= ~B_DIRTY; +  } else { +    // read +    memmove(b->data, addr, BSIZE); +    b->flags |= B_VALID; +  } +} diff --git a/kernel/riscv.h b/kernel/riscv.h new file mode 100644 index 0000000..0f83db6 --- /dev/null +++ b/kernel/riscv.h @@ -0,0 +1,358 @@ +// which hart (core) is this? +static inline uint64 +r_mhartid() +{ +  uint64 x; +  asm volatile("csrr %0, mhartid" : "=r" (x) ); +  return x; +} + +// Machine Status Register, mstatus + +#define MSTATUS_MPP_MASK (3L << 11) // previous mode. +#define MSTATUS_MPP_M (3L << 11) +#define MSTATUS_MPP_S (1L << 11) +#define MSTATUS_MPP_U (0L << 11) +#define MSTATUS_MIE (1L << 3)    // machine-mode interrupt enable. + +static inline uint64 +r_mstatus() +{ +  uint64 x; +  asm volatile("csrr %0, mstatus" : "=r" (x) ); +  return x; +} + +static inline void  +w_mstatus(uint64 x) +{ +  asm volatile("csrw mstatus, %0" : : "r" (x)); +} + +// machine exception program counter, holds the +// instruction address to which a return from +// exception will go. +static inline void  +w_mepc(uint64 x) +{ +  asm volatile("csrw mepc, %0" : : "r" (x)); +} + +// Supervisor Status Register, sstatus + +#define SSTATUS_SPP (1L << 8)  // Previous mode, 1=Supervisor, 0=User +#define SSTATUS_SPIE (1L << 5) // Supervisor Previous Interrupt Enable +#define SSTATUS_UPIE (1L << 4) // User Previous Interrupt Enable +#define SSTATUS_SIE (1L << 1)  // Supervisor Interrupt Enable +#define SSTATUS_UIE (1L << 0)  // User Interrupt Enable + +static inline uint64 +r_sstatus() +{ +  uint64 x; +  asm volatile("csrr %0, sstatus" : "=r" (x) ); +  return x; +} + +static inline void  +w_sstatus(uint64 x) +{ +  asm volatile("csrw sstatus, %0" : : "r" (x)); +} + +// Supervisor Interrupt Pending +static inline uint64 +r_sip() +{ +  uint64 x; +  asm volatile("csrr %0, sip" : "=r" (x) ); +  return x; +} + +static inline void  +w_sip(uint64 x) +{ +  asm volatile("csrw sip, %0" : : "r" (x)); +} + +// Supervisor Interrupt Enable +#define SIE_SEIE (1L << 9) // external +#define SIE_STIE (1L << 5) // timer +#define SIE_SSIE (1L << 1) // software +static inline uint64 +r_sie() +{ +  uint64 x; +  asm volatile("csrr %0, sie" : "=r" (x) ); +  return x; +} + +static inline void  +w_sie(uint64 x) +{ +  asm volatile("csrw sie, %0" : : "r" (x)); +} + +// Machine-mode Interrupt Enable +#define MIE_MEIE (1L << 11) // external +#define MIE_MTIE (1L << 7)  // timer +#define MIE_MSIE (1L << 3)  // software +static inline uint64 +r_mie() +{ +  uint64 x; +  asm volatile("csrr %0, mie" : "=r" (x) ); +  return x; +} + +static inline void  +w_mie(uint64 x) +{ +  asm volatile("csrw mie, %0" : : "r" (x)); +} + +// machine exception program counter, holds the +// instruction address to which a return from +// exception will go. 
+static inline void  +w_sepc(uint64 x) +{ +  asm volatile("csrw sepc, %0" : : "r" (x)); +} + +static inline uint64 +r_sepc() +{ +  uint64 x; +  asm volatile("csrr %0, sepc" : "=r" (x) ); +  return x; +} + +// Machine Exception Delegation +static inline uint64 +r_medeleg() +{ +  uint64 x; +  asm volatile("csrr %0, medeleg" : "=r" (x) ); +  return x; +} + +static inline void  +w_medeleg(uint64 x) +{ +  asm volatile("csrw medeleg, %0" : : "r" (x)); +} + +// Machine Interrupt Delegation +static inline uint64 +r_mideleg() +{ +  uint64 x; +  asm volatile("csrr %0, mideleg" : "=r" (x) ); +  return x; +} + +static inline void  +w_mideleg(uint64 x) +{ +  asm volatile("csrw mideleg, %0" : : "r" (x)); +} + +// Supervisor Trap-Vector Base Address +// low two bits are mode. +static inline void  +w_stvec(uint64 x) +{ +  asm volatile("csrw stvec, %0" : : "r" (x)); +} + +static inline uint64 +r_stvec() +{ +  uint64 x; +  asm volatile("csrr %0, stvec" : "=r" (x) ); +  return x; +} + +// Machine-mode interrupt vector +static inline void  +w_mtvec(uint64 x) +{ +  asm volatile("csrw mtvec, %0" : : "r" (x)); +} + +// use riscv's sv39 page table scheme. +#define SATP_SV39 (8L << 60) + +#define MAKE_SATP(pagetable) (SATP_SV39 | (((uint64)pagetable) >> 12)) + +// supervisor address translation and protection; +// holds the address of the page table. +static inline void  +w_satp(uint64 x) +{ +  asm volatile("csrw satp, %0" : : "r" (x)); +} + +static inline uint64 +r_satp() +{ +  uint64 x; +  asm volatile("csrr %0, satp" : "=r" (x) ); +  return x; +} + +// Supervisor Scratch register, for early trap handler in trampoline.S. +static inline void  +w_sscratch(uint64 x) +{ +  asm volatile("csrw sscratch, %0" : : "r" (x)); +} + +static inline void  +w_mscratch(uint64 x) +{ +  asm volatile("csrw mscratch, %0" : : "r" (x)); +} + +// Supervisor Trap Cause +static inline uint64 +r_scause() +{ +  uint64 x; +  asm volatile("csrr %0, scause" : "=r" (x) ); +  return x; +} + +// Supervisor Trap Value +static inline uint64 +r_stval() +{ +  uint64 x; +  asm volatile("csrr %0, stval" : "=r" (x) ); +  return x; +} + +// Machine-mode Counter-Enable +static inline void  +w_mcounteren(uint64 x) +{ +  asm volatile("csrw mcounteren, %0" : : "r" (x)); +} + +static inline uint64 +r_mcounteren() +{ +  uint64 x; +  asm volatile("csrr %0, mcounteren" : "=r" (x) ); +  return x; +} + +// machine-mode cycle counter +static inline uint64 +r_time() +{ +  uint64 x; +  asm volatile("csrr %0, time" : "=r" (x) ); +  return x; +} + +// enable device interrupts +static inline void +intr_on() +{ +  w_sie(r_sie() | SIE_SEIE | SIE_STIE | SIE_SSIE); +  w_sstatus(r_sstatus() | SSTATUS_SIE); +} + +// disable device interrupts +static inline void +intr_off() +{ +  w_sstatus(r_sstatus() & ~SSTATUS_SIE); +} + +// are device interrupts enabled? +static inline int +intr_get() +{ +  uint64 x = r_sstatus(); +  return (x & SSTATUS_SIE) != 0; +} + +static inline uint64 +r_sp() +{ +  uint64 x; +  asm volatile("mv %0, sp" : "=r" (x) ); +  return x; +} + +// read and write tp, the thread pointer, which holds +// this core's hartid (core number), the index into cpus[]. 
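MAKE_SATP, w_satp(), and sfence_vma() (defined a little further down in this header) are meant to be used together when installing a page table; this is roughly what kvminithart() does in vm.c, which is outside this part of the diff. A sketch, with use_pagetable() as a hypothetical name:

// Sketch: install a page table using the satp helpers in this header.
static void
use_pagetable(pagetable_t pt)
{
  w_satp(MAKE_SATP(pt));   // Sv39 mode bits plus the root table's physical page number
  sfence_vma();            // flush TLB entries cached from the previous table
}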
+static inline uint64 +r_tp() +{ +  uint64 x; +  asm volatile("mv %0, tp" : "=r" (x) ); +  return x; +} + +static inline void  +w_tp(uint64 x) +{ +  asm volatile("mv tp, %0" : : "r" (x)); +} + +static inline uint64 +r_ra() +{ +  uint64 x; +  asm volatile("mv %0, ra" : "=r" (x) ); +  return x; +} + +// tell the machine to finish any previous writes to +// PTEs, so that a subsequent use of a virtual +// address or load of the SATP will see those writes. +// perhaps this also flushes the TLB. +static inline void +sfence_vma() +{ +  // the zero, zero means flush all TLB entries. +  asm volatile("sfence.vma zero, zero"); +} + + +#define PGSIZE 4096 // bytes per page +#define PGSHIFT 12  // bits of offset within a page + +#define PGROUNDUP(sz)  (((sz)+PGSIZE-1) & ~(PGSIZE-1)) +#define PGROUNDDOWN(a) (((a)) & ~(PGSIZE-1)) + +#define PTE_V (1L << 0) // valid +#define PTE_R (1L << 1) +#define PTE_W (1L << 2) +#define PTE_X (1L << 3) +#define PTE_U (1L << 4) // 1 -> user can access + +// shift a physical address to the right place for a PTE. +#define PA2PTE(pa) ((((uint64)pa) >> 12) << 10) + +#define PTE2PA(pte) (((pte) >> 10) << 12) + +#define PTE_FLAGS(pte) ((pte) & (PTE_V|PTE_R|PTE_W|PTE_X|PTE_U)) + +// extract the three 9-bit page table indices from a virtual address. +#define PXMASK          0x1FF // 9 bits +#define PXSHIFT(level)  (PGSHIFT+(9*(level))) +#define PX(level, va) ((((uint64) (va)) >> PXSHIFT(level)) & PXMASK) + +// one beyond the highest possible virtual address. +// MAXVA is actually one bit less than the max allowed by +// Sv39, to avoid having to sign-extend virtual addresses +// that have the high bit set. +#define MAXVA (1L << (9 + 9 + 9 + 12 - 1)) + +typedef uint64 pte_t; +typedef uint64 *pagetable_t; // 512 PTEs diff --git a/kernel/sleeplock.c b/kernel/sleeplock.c new file mode 100644 index 0000000..81de585 --- /dev/null +++ b/kernel/sleeplock.c @@ -0,0 +1,55 @@ +// Sleeping locks + +#include "types.h" +#include "riscv.h" +#include "defs.h" +#include "param.h" +#include "memlayout.h" +#include "spinlock.h" +#include "proc.h" +#include "sleeplock.h" + +void +initsleeplock(struct sleeplock *lk, char *name) +{ +  initlock(&lk->lk, "sleep lock"); +  lk->name = name; +  lk->locked = 0; +  lk->pid = 0; +} + +void +acquiresleep(struct sleeplock *lk) +{ +  acquire(&lk->lk); +  while (lk->locked) { +    sleep(lk, &lk->lk); +  } +  lk->locked = 1; +  lk->pid = myproc()->pid; +  release(&lk->lk); +} + +void +releasesleep(struct sleeplock *lk) +{ +  acquire(&lk->lk); +  lk->locked = 0; +  lk->pid = 0; +  wakeup(lk); +  release(&lk->lk); +} + +int +holdingsleep(struct sleeplock *lk) +{ +  int r; +   +  acquire(&lk->lk); +  r = lk->locked && (lk->pid == myproc()->pid); +  release(&lk->lk); +  return r; +} + + + diff --git a/kernel/sleeplock.h b/kernel/sleeplock.h new file mode 100644 index 0000000..110e6f3 --- /dev/null +++ b/kernel/sleeplock.h @@ -0,0 +1,10 @@ +// Long-term locks for processes +struct sleeplock { +  uint locked;       // Is the lock held? +  struct spinlock lk; // spinlock protecting this sleep lock +   +  // For debugging: +  char *name;        // Name of lock. +  int pid;           // Process holding lock +}; + diff --git a/kernel/spinlock.c b/kernel/spinlock.c new file mode 100644 index 0000000..563532e --- /dev/null +++ b/kernel/spinlock.c @@ -0,0 +1,108 @@ +// Mutual exclusion spin locks. 
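The PX() and PTE2PA() macros a little above are what vm.c's page-table walk is built from (vm.c is not in this part of the diff). The sketch below mirrors that walk without its allocation path, to show how a Sv39 virtual address is decomposed three 9-bit indices at a time; walk_sketch() is a hypothetical name.

// Sketch of a Sv39 lookup using PX()/PTE2PA() from riscv.h above.
// Returns the address of the leaf PTE, or 0 if an intermediate level is absent.
static pte_t *
walk_sketch(pagetable_t pagetable, uint64 va)
{
  if(va >= MAXVA)
    return 0;
  for(int level = 2; level > 0; level--){
    pte_t *pte = &pagetable[PX(level, va)];  // 9-bit index for this level
    if((*pte & PTE_V) == 0)
      return 0;                              // not mapped (the real walk may allocate)
    pagetable = (pagetable_t)PTE2PA(*pte);   // descend; the kernel maps physical memory directly
  }
  return &pagetable[PX(0, va)];              // leaf PTE
}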
+ +#include "types.h" +#include "param.h" +#include "memlayout.h" +#include "spinlock.h" +#include "riscv.h" +#include "proc.h" +#include "defs.h" + +void +initlock(struct spinlock *lk, char *name) +{ +  lk->name = name; +  lk->locked = 0; +  lk->cpu = 0; +} + +// Acquire the lock. +// Loops (spins) until the lock is acquired. +void +acquire(struct spinlock *lk) +{ +  push_off(); // disable interrupts to avoid deadlock. +  if(holding(lk)) +    panic("acquire"); + +  // On RISC-V, sync_lock_test_and_set turns into an atomic swap: +  //   a5 = 1 +  //   s1 = &lk->locked +  //   amoswap.w.aq a5, a5, (s1) +  while(__sync_lock_test_and_set(&lk->locked, 1) != 0) +    ; + +  // Tell the C compiler and the processor to not move loads or stores +  // past this point, to ensure that the critical section's memory +  // references happen after the lock is acquired. +  __sync_synchronize(); + +  // Record info about lock acquisition for holding() and debugging. +  lk->cpu = mycpu(); +} + +// Release the lock. +void +release(struct spinlock *lk) +{ +  if(!holding(lk)) +    panic("release"); + +  lk->cpu = 0; + +  // Tell the C compiler and the CPU to not move loads or stores +  // past this point, to ensure that all the stores in the critical +  // section are visible to other CPUs before the lock is released. +  // On RISC-V, this turns into a fence instruction. +  __sync_synchronize(); + +  // Release the lock, equivalent to lk->locked = 0. +  // This code doesn't use a C assignment, since the C standard +  // implies that an assignment might be implemented with +  // multiple store instructions. +  // On RISC-V, sync_lock_release turns into an atomic swap: +  //   s1 = &lk->locked +  //   amoswap.w zero, zero, (s1) +  __sync_lock_release(&lk->locked); + +  pop_off(); +} + +// Check whether this cpu is holding the lock. +int +holding(struct spinlock *lk) +{ +  int r; +  push_off(); +  r = (lk->locked && lk->cpu == mycpu()); +  pop_off(); +  return r; +} + +// push_off/pop_off are like intr_off()/intr_on() except that they are matched: +// it takes two pop_off()s to undo two push_off()s.  Also, if interrupts +// are initially off, then push_off, pop_off leaves them off. + +void +push_off(void) +{ +  int old = intr_get(); + +  intr_off(); +  if(mycpu()->noff == 0) +    mycpu()->intena = old; +  mycpu()->noff += 1; +} + +void +pop_off(void) +{ +  struct cpu *c = mycpu(); +  if(intr_get()) +    panic("pop_off - interruptible"); +  c->noff -= 1; +  if(c->noff < 0) +    panic("pop_off"); +  if(c->noff == 0 && c->intena) +    intr_on(); +} diff --git a/kernel/spinlock.h b/kernel/spinlock.h new file mode 100644 index 0000000..4392820 --- /dev/null +++ b/kernel/spinlock.h @@ -0,0 +1,9 @@ +// Mutual exclusion lock. +struct spinlock { +  uint locked;       // Is the lock held? + +  // For debugging: +  char *name;        // Name of lock. +  struct cpu *cpu;   // The cpu holding the lock. +}; + diff --git a/kernel/start.c b/kernel/start.c new file mode 100644 index 0000000..203c5e6 --- /dev/null +++ b/kernel/start.c @@ -0,0 +1,82 @@ +#include "types.h" +#include "param.h" +#include "memlayout.h" +#include "riscv.h" +#include "defs.h" + +void main(); +void timerinit(); + +// entry.S needs one stack per CPU. +__attribute__ ((aligned (16))) char stack0[4096 * NCPU]; + +// scratch area for timer interrupt, one per CPU. +uint64 mscratch0[NCPU * 32]; + +// assembly code in kernelvec.S for machine-mode timer interrupt. +extern void timervec(); + +// entry.S jumps here in machine mode on stack0. 
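acquire() and release() above only help if every access to the shared data happens between them. A minimal usage sketch; counter, counterlock, and bump() are hypothetical names, and counterlock needs an initlock() call at startup.

// Sketch of the acquire()/release() discipline for a shared counter.
struct spinlock counterlock;   // initlock(&counterlock, "counter") at init time
int counter;

void
bump(void)
{
  acquire(&counterlock);   // also push_off(): no interrupts while the lock is held
  counter += 1;            // __sync_synchronize() keeps this inside the critical section
  release(&counterlock);   // pop_off() may re-enable interrupts
}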
+void +start() +{ +  // set M Previous Privilege mode to Supervisor, for mret. +  unsigned long x = r_mstatus(); +  x &= ~MSTATUS_MPP_MASK; +  x |= MSTATUS_MPP_S; +  w_mstatus(x); + +  // set M Exception Program Counter to main, for mret. +  // requires gcc -mcmodel=medany +  w_mepc((uint64)main); + +  // disable paging for now. +  w_satp(0); + +  // delegate all interrupts and exceptions to supervisor mode. +  w_medeleg(0xffff); +  w_mideleg(0xffff); + +  // ask for clock interrupts. +  timerinit(); + +  // keep each CPU's hartid in its tp register, for cpuid(). +  int id = r_mhartid(); +  w_tp(id); + +  // switch to supervisor mode and jump to main(). +  asm volatile("mret"); +} + +// set up to receive timer interrupts in machine mode, +// which arrive at timervec in kernelvec.S, +// which turns them into software interrupts for +// devintr() in trap.c. +void +timerinit() +{ +  // each CPU has a separate source of timer interrupts. +  int id = r_mhartid(); + +  // ask the CLINT for a timer interrupt. +  int interval = 1000000; // cycles; about 1/10th second in qemu. +  *(uint64*)CLINT_MTIMECMP(id) = *(uint64*)CLINT_MTIME + interval; + +  // prepare information in scratch[] for timervec. +  // scratch[0..3] : space for timervec to save registers. +  // scratch[4] : address of CLINT MTIMECMP register. +  // scratch[5] : desired interval (in cycles) between timer interrupts. +  uint64 *scratch = &mscratch0[32 * id]; +  scratch[4] = CLINT_MTIMECMP(id); +  scratch[5] = interval; +  w_mscratch((uint64)scratch); + +  // set the machine-mode trap handler. +  w_mtvec((uint64)timervec); + +  // enable machine-mode interrupts. +  w_mstatus(r_mstatus() | MSTATUS_MIE); + +  // enable machine-mode timer interrupts. +  w_mie(r_mie() | MIE_MTIE); +} diff --git a/kernel/stat.h b/kernel/stat.h new file mode 100644 index 0000000..19543af --- /dev/null +++ b/kernel/stat.h @@ -0,0 +1,11 @@ +#define T_DIR     1   // Directory +#define T_FILE    2   // File +#define T_DEVICE  3   // Device + +struct stat { +  int dev;     // File system's disk device +  uint ino;    // Inode number +  short type;  // Type of file +  short nlink; // Number of links to file +  uint64 size; // Size of file in bytes +}; diff --git a/kernel/string.c b/kernel/string.c new file mode 100644 index 0000000..d99e612 --- /dev/null +++ b/kernel/string.c @@ -0,0 +1,104 @@ +#include "types.h" + +void* +memset(void *dst, int c, uint n) +{ +  char *cdst = (char *) dst; +  int i; +  for(i = 0; i < n; i++){ +    cdst[i] = c; +  } +  return dst; +} + +int +memcmp(const void *v1, const void *v2, uint n) +{ +  const uchar *s1, *s2; + +  s1 = v1; +  s2 = v2; +  while(n-- > 0){ +    if(*s1 != *s2) +      return *s1 - *s2; +    s1++, s2++; +  } + +  return 0; +} + +void* +memmove(void *dst, const void *src, uint n) +{ +  const char *s; +  char *d; + +  s = src; +  d = dst; +  if(s < d && s + n > d){ +    s += n; +    d += n; +    while(n-- > 0) +      *--d = *--s; +  } else +    while(n-- > 0) +      *d++ = *s++; + +  return dst; +} + +// memcpy exists to placate GCC.  Use memmove. 
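memmove() above copies backwards when the destination starts inside the source range; a forward copy would overwrite source bytes before reading them. A small illustration (overlap_demo() is hypothetical):

// Illustration of why memmove() checks for overlap: shift a string right by one, in place.
void
overlap_demo(void)
{
  char s[8] = "abcdef";
  memmove(s + 1, s, 6);    // dst overlaps src; the backward copy yields "aabcdef"
}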
+void* +memcpy(void *dst, const void *src, uint n) +{ +  return memmove(dst, src, n); +} + +int +strncmp(const char *p, const char *q, uint n) +{ +  while(n > 0 && *p && *p == *q) +    n--, p++, q++; +  if(n == 0) +    return 0; +  return (uchar)*p - (uchar)*q; +} + +char* +strncpy(char *s, const char *t, int n) +{ +  char *os; + +  os = s; +  while(n-- > 0 && (*s++ = *t++) != 0) +    ; +  while(n-- > 0) +    *s++ = 0; +  return os; +} + +// Like strncpy but guaranteed to NUL-terminate. +char* +safestrcpy(char *s, const char *t, int n) +{ +  char *os; + +  os = s; +  if(n <= 0) +    return os; +  while(--n > 0 && (*s++ = *t++) != 0) +    ; +  *s = 0; +  return os; +} + +int +strlen(const char *s) +{ +  int n; + +  for(n = 0; s[n]; n++) +    ; +  return n; +} + diff --git a/kernel/swtch.S b/kernel/swtch.S new file mode 100644 index 0000000..17a8663 --- /dev/null +++ b/kernel/swtch.S @@ -0,0 +1,42 @@ +# Context switch +# +#   void swtch(struct context *old, struct context *new); +#  +# Save current registers in old. Load from new.	 + + +.globl swtch +swtch: +        sd ra, 0(a0) +        sd sp, 8(a0) +        sd s0, 16(a0) +        sd s1, 24(a0) +        sd s2, 32(a0) +        sd s3, 40(a0) +        sd s4, 48(a0) +        sd s5, 56(a0) +        sd s6, 64(a0) +        sd s7, 72(a0) +        sd s8, 80(a0) +        sd s9, 88(a0) +        sd s10, 96(a0) +        sd s11, 104(a0) + +        ld ra, 0(a1) +        ld sp, 8(a1) +        ld s0, 16(a1) +        ld s1, 24(a1) +        ld s2, 32(a1) +        ld s3, 40(a1) +        ld s4, 48(a1) +        ld s5, 56(a1) +        ld s6, 64(a1) +        ld s7, 72(a1) +        ld s8, 80(a1) +        ld s9, 88(a1) +        ld s10, 96(a1) +        ld s11, 104(a1) +         +        ret + +	 diff --git a/kernel/syscall.c b/kernel/syscall.c new file mode 100644 index 0000000..97974d6 --- /dev/null +++ b/kernel/syscall.c @@ -0,0 +1,147 @@ +#include "types.h" +#include "param.h" +#include "memlayout.h" +#include "riscv.h" +#include "spinlock.h" +#include "proc.h" +#include "syscall.h" +#include "defs.h" + +// Fetch the uint64 at addr from the current process. +int +fetchaddr(uint64 addr, uint64 *ip) +{ +  struct proc *p = myproc(); +  if(addr >= p->sz || addr+sizeof(uint64) > p->sz) +    return -1; +  if(copyin(p->pagetable, (char *)ip, addr, sizeof(*ip)) != 0) +    return -1; +  return 0; +} + +// Fetch the nul-terminated string at addr from the current process. +// Doesn't actually copy the string - just sets *pp to point at it. +// Returns length of string, not including nul, or -1 for error. +int +fetchstr(uint64 addr, char *buf, int max) +{ +  struct proc *p = myproc(); +  int err = copyinstr(p->pagetable, buf, addr, max); +  if(err < 0) +    return err; +  return strlen(buf); +} + +static uint64 +argraw(int n) +{ +  struct proc *p = myproc(); +  switch (n) { +  case 0: +    return p->tf->a0; +  case 1: +    return p->tf->a1; +  case 2: +    return p->tf->a2; +  case 3: +    return p->tf->a3; +  case 4: +    return p->tf->a4; +  case 5: +    return p->tf->a5; +  } +  panic("argraw"); +  return -1; +} + +// Fetch the nth 32-bit system call argument. +int +argint(int n, int *ip) +{ +  *ip = argraw(n); +  return 0; +} + +// Retrieve an argument as a pointer. +// Doesn't check for legality, since +// copyin/copyout will do that. +int +argaddr(int n, uint64 *ip) +{ +  *ip = argraw(n); +  return 0; +} + +// Fetch the nth word-sized system call argument as a null-terminated string. +// Copies into buf, at most max. 
+// Returns string length if OK (including nul), -1 if error. +int +argstr(int n, char *buf, int max) +{ +  uint64 addr; +  if(argaddr(n, &addr) < 0) +    return -1; +  return fetchstr(addr, buf, max); +} + +extern uint64 sys_chdir(void); +extern uint64 sys_close(void); +extern uint64 sys_dup(void); +extern uint64 sys_exec(void); +extern uint64 sys_exit(void); +extern uint64 sys_fork(void); +extern uint64 sys_fstat(void); +extern uint64 sys_getpid(void); +extern uint64 sys_kill(void); +extern uint64 sys_link(void); +extern uint64 sys_mkdir(void); +extern uint64 sys_mknod(void); +extern uint64 sys_open(void); +extern uint64 sys_pipe(void); +extern uint64 sys_read(void); +extern uint64 sys_sbrk(void); +extern uint64 sys_sleep(void); +extern uint64 sys_unlink(void); +extern uint64 sys_wait(void); +extern uint64 sys_write(void); +extern uint64 sys_uptime(void); + +static uint64 (*syscalls[])(void) = { +[SYS_fork]    sys_fork, +[SYS_exit]    sys_exit, +[SYS_wait]    sys_wait, +[SYS_pipe]    sys_pipe, +[SYS_read]    sys_read, +[SYS_kill]    sys_kill, +[SYS_exec]    sys_exec, +[SYS_fstat]   sys_fstat, +[SYS_chdir]   sys_chdir, +[SYS_dup]     sys_dup, +[SYS_getpid]  sys_getpid, +[SYS_sbrk]    sys_sbrk, +[SYS_sleep]   sys_sleep, +[SYS_uptime]  sys_uptime, +[SYS_open]    sys_open, +[SYS_write]   sys_write, +[SYS_mknod]   sys_mknod, +[SYS_unlink]  sys_unlink, +[SYS_link]    sys_link, +[SYS_mkdir]   sys_mkdir, +[SYS_close]   sys_close, +}; + +void +syscall(void) +{ +  int num; +  struct proc *p = myproc(); + +  num = p->tf->a7; +  if(num > 0 && num < NELEM(syscalls) && syscalls[num]) { +    p->tf->a0 = syscalls[num](); +  } else { +    printf("%d %s: unknown sys call %d\n", +            p->pid, p->name, num); +    p->tf->a0 = -1; +  } +} diff --git a/kernel/syscall.h b/kernel/syscall.h new file mode 100644 index 0000000..bc5f356 --- /dev/null +++ b/kernel/syscall.h @@ -0,0 +1,22 @@ +// System call numbers +#define SYS_fork    1 +#define SYS_exit    2 +#define SYS_wait    3 +#define SYS_pipe    4 +#define SYS_read    5 +#define SYS_kill    6 +#define SYS_exec    7 +#define SYS_fstat   8 +#define SYS_chdir   9 +#define SYS_dup    10 +#define SYS_getpid 11 +#define SYS_sbrk   12 +#define SYS_sleep  13 +#define SYS_uptime 14 +#define SYS_open   15 +#define SYS_write  16 +#define SYS_mknod  17 +#define SYS_unlink 18 +#define SYS_link   19 +#define SYS_mkdir  20 +#define SYS_close  21 diff --git a/kernel/sysfile.c b/kernel/sysfile.c new file mode 100644 index 0000000..23a9540 --- /dev/null +++ b/kernel/sysfile.c @@ -0,0 +1,476 @@ +// +// File-system system calls. +// Mostly argument checking, since we don't trust +// user code, and calls into file.c and fs.c. +// + +#include "types.h" +#include "riscv.h" +#include "defs.h" +#include "param.h" +#include "stat.h" +#include "spinlock.h" +#include "proc.h" +#include "fs.h" +#include "sleeplock.h" +#include "file.h" +#include "fcntl.h" + +// Fetch the nth word-sized system call argument as a file descriptor +// and return both the descriptor and the corresponding struct file. +static int +argfd(int n, int *pfd, struct file **pf) +{ +  int fd; +  struct file *f; + +  if(argint(n, &fd) < 0) +    return -1; +  if(fd < 0 || fd >= NOFILE || (f=myproc()->ofile[fd]) == 0) +    return -1; +  if(pfd) +    *pfd = fd; +  if(pf) +    *pf = f; +  return 0; +} + +// Allocate a file descriptor for the given file. +// Takes over file reference from caller on success. 
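The syscalls[] table and syscall() above are the whole dispatch mechanism, so adding a system call is mostly bookkeeping. A hedged sketch: SYS_hello and sys_hello are hypothetical, and a real addition also needs a user-side stub (user.h/usys), which is outside this diff.

// Sketch of wiring a hypothetical system call into the dispatch above.
//
// syscall.h:   #define SYS_hello 22
// syscall.c:   extern uint64 sys_hello(void);
//              ... and add  [SYS_hello]  sys_hello,  to syscalls[] ...

uint64
sys_hello(void)
{
  int n;

  if(argint(0, &n) < 0)      // argument 0 comes from the saved user a0
    return -1;
  printf("hello %d\n", n);   // kernel printf, via consputc()
  return 0;
}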
+static int +fdalloc(struct file *f) +{ +  int fd; +  struct proc *p = myproc(); + +  for(fd = 0; fd < NOFILE; fd++){ +    if(p->ofile[fd] == 0){ +      p->ofile[fd] = f; +      return fd; +    } +  } +  return -1; +} + +uint64 +sys_dup(void) +{ +  struct file *f; +  int fd; + +  if(argfd(0, 0, &f) < 0) +    return -1; +  if((fd=fdalloc(f)) < 0) +    return -1; +  filedup(f); +  return fd; +} + +uint64 +sys_read(void) +{ +  struct file *f; +  int n; +  uint64 p; + +  if(argfd(0, 0, &f) < 0 || argint(2, &n) < 0 || argaddr(1, &p) < 0) +    return -1; +  return fileread(f, p, n); +} + +uint64 +sys_write(void) +{ +  struct file *f; +  int n; +  uint64 p; + +  if(argfd(0, 0, &f) < 0 || argint(2, &n) < 0 || argaddr(1, &p) < 0) +    return -1; + +  return filewrite(f, p, n); +} + +uint64 +sys_close(void) +{ +  int fd; +  struct file *f; + +  if(argfd(0, &fd, &f) < 0) +    return -1; +  myproc()->ofile[fd] = 0; +  fileclose(f); +  return 0; +} + +uint64 +sys_fstat(void) +{ +  struct file *f; +  uint64 st; // user pointer to struct stat + +  if(argfd(0, 0, &f) < 0 || argaddr(1, &st) < 0) +    return -1; +  return filestat(f, st); +} + +// Create the path new as a link to the same inode as old. +uint64 +sys_link(void) +{ +  char name[DIRSIZ], new[MAXPATH], old[MAXPATH]; +  struct inode *dp, *ip; + +  if(argstr(0, old, MAXPATH) < 0 || argstr(1, new, MAXPATH) < 0) +    return -1; + +  begin_op(); +  if((ip = namei(old)) == 0){ +    end_op(); +    return -1; +  } + +  ilock(ip); +  if(ip->type == T_DIR){ +    iunlockput(ip); +    end_op(); +    return -1; +  } + +  ip->nlink++; +  iupdate(ip); +  iunlock(ip); + +  if((dp = nameiparent(new, name)) == 0) +    goto bad; +  ilock(dp); +  if(dp->dev != ip->dev || dirlink(dp, name, ip->inum) < 0){ +    iunlockput(dp); +    goto bad; +  } +  iunlockput(dp); +  iput(ip); + +  end_op(); + +  return 0; + +bad: +  ilock(ip); +  ip->nlink--; +  iupdate(ip); +  iunlockput(ip); +  end_op(); +  return -1; +} + +// Is the directory dp empty except for "." and ".." ? +static int +isdirempty(struct inode *dp) +{ +  int off; +  struct dirent de; + +  for(off=2*sizeof(de); off<dp->size; off+=sizeof(de)){ +    if(readi(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de)) +      panic("isdirempty: readi"); +    if(de.inum != 0) +      return 0; +  } +  return 1; +} + +uint64 +sys_unlink(void) +{ +  struct inode *ip, *dp; +  struct dirent de; +  char name[DIRSIZ], path[MAXPATH]; +  uint off; + +  if(argstr(0, path, MAXPATH) < 0) +    return -1; + +  begin_op(); +  if((dp = nameiparent(path, name)) == 0){ +    end_op(); +    return -1; +  } + +  ilock(dp); + +  // Cannot unlink "." or "..". 
+  if(namecmp(name, ".") == 0 || namecmp(name, "..") == 0) +    goto bad; + +  if((ip = dirlookup(dp, name, &off)) == 0) +    goto bad; +  ilock(ip); + +  if(ip->nlink < 1) +    panic("unlink: nlink < 1"); +  if(ip->type == T_DIR && !isdirempty(ip)){ +    iunlockput(ip); +    goto bad; +  } + +  memset(&de, 0, sizeof(de)); +  if(writei(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de)) +    panic("unlink: writei"); +  if(ip->type == T_DIR){ +    dp->nlink--; +    iupdate(dp); +  } +  iunlockput(dp); + +  ip->nlink--; +  iupdate(ip); +  iunlockput(ip); + +  end_op(); + +  return 0; + +bad: +  iunlockput(dp); +  end_op(); +  return -1; +} + +static struct inode* +create(char *path, short type, short major, short minor) +{ +  struct inode *ip, *dp; +  char name[DIRSIZ]; + +  if((dp = nameiparent(path, name)) == 0) +    return 0; +  ilock(dp); + +  if((ip = dirlookup(dp, name, 0)) != 0){ +    iunlockput(dp); +    ilock(ip); +    if(type == T_FILE && (ip->type == T_FILE || ip->type == T_DEVICE)) +      return ip; +    iunlockput(ip); +    return 0; +  } + +  if((ip = ialloc(dp->dev, type)) == 0) +    panic("create: ialloc"); + +  ilock(ip); +  ip->major = major; +  ip->minor = minor; +  ip->nlink = 1; +  iupdate(ip); + +  if(type == T_DIR){  // Create . and .. entries. +    dp->nlink++;  // for ".." +    iupdate(dp); +    // No ip->nlink++ for ".": avoid cyclic ref count. +    if(dirlink(ip, ".", ip->inum) < 0 || dirlink(ip, "..", dp->inum) < 0) +      panic("create dots"); +  } + +  if(dirlink(dp, name, ip->inum) < 0) +    panic("create: dirlink"); + +  iunlockput(dp); + +  return ip; +} + +uint64 +sys_open(void) +{ +  char path[MAXPATH]; +  int fd, omode; +  struct file *f; +  struct inode *ip; + +  if(argstr(0, path, MAXPATH) < 0 || argint(1, &omode) < 0) +    return -1; + +  begin_op(); + +  if(omode & O_CREATE){ +    ip = create(path, T_FILE, 0, 0); +    if(ip == 0){ +      end_op(); +      return -1; +    } +  } else { +    if((ip = namei(path)) == 0){ +      end_op(); +      return -1; +    } +    ilock(ip); +    if(ip->type == T_DIR && omode != O_RDONLY){ +      iunlockput(ip); +      end_op(); +      return -1; +    } +  } + +  if(ip->type == T_DEVICE && (ip->major < 0 || ip->major >= NDEV)){ +    iunlockput(ip); +    end_op(); +    return -1; +  } + +  if((f = filealloc()) == 0 || (fd = fdalloc(f)) < 0){ +    if(f) +      fileclose(f); +    iunlockput(ip); +    end_op(); +    return -1; +  } + +  if(ip->type == T_DEVICE){ +    f->type = FD_DEVICE; +    f->major = ip->major; +  } else { +    f->type = FD_INODE; +    f->off = 0; +  } +  f->ip = ip; +  f->readable = !(omode & O_WRONLY); +  f->writable = (omode & O_WRONLY) || (omode & O_RDWR); + +  iunlock(ip); +  end_op(); + +  return fd; +} + +uint64 +sys_mkdir(void) +{ +  char path[MAXPATH]; +  struct inode *ip; + +  begin_op(); +  if(argstr(0, path, MAXPATH) < 0 || (ip = create(path, T_DIR, 0, 0)) == 0){ +    end_op(); +    return -1; +  } +  iunlockput(ip); +  end_op(); +  return 0; +} + +uint64 +sys_mknod(void) +{ +  struct inode *ip; +  char path[MAXPATH]; +  int major, minor; + +  begin_op(); +  if((argstr(0, path, MAXPATH)) < 0 || +     argint(1, &major) < 0 || +     argint(2, &minor) < 0 || +     (ip = create(path, T_DEVICE, major, minor)) == 0){ +    end_op(); +    return -1; +  } +  iunlockput(ip); +  end_op(); +  return 0; +} + +uint64 +sys_chdir(void) +{ +  char path[MAXPATH]; +  struct inode *ip; +  struct proc *p = myproc(); +   +  begin_op(); +  if(argstr(0, path, MAXPATH) < 0 || (ip = namei(path)) == 0){ +    end_op(); 
+    return -1; +  } +  ilock(ip); +  if(ip->type != T_DIR){ +    iunlockput(ip); +    end_op(); +    return -1; +  } +  iunlock(ip); +  iput(p->cwd); +  end_op(); +  p->cwd = ip; +  return 0; +} + +uint64 +sys_exec(void) +{ +  char path[MAXPATH], *argv[MAXARG]; +  int i; +  uint64 uargv, uarg; + +  if(argstr(0, path, MAXPATH) < 0 || argaddr(1, &uargv) < 0){ +    return -1; +  } +  memset(argv, 0, sizeof(argv)); +  for(i=0;; i++){ +    if(i >= NELEM(argv)){ +      return -1; +    } +    if(fetchaddr(uargv+sizeof(uint64)*i, (uint64*)&uarg) < 0){ +      return -1; +    } +    if(uarg == 0){ +      argv[i] = 0; +      break; +    } +    argv[i] = kalloc(); +    if(argv[i] == 0) +      panic("sys_exec kalloc"); +    if(fetchstr(uarg, argv[i], PGSIZE) < 0){ +      return -1; +    } +  } + +  int ret = exec(path, argv); + +  for(i = 0; i < NELEM(argv) && argv[i] != 0; i++) +    kfree(argv[i]); + +  return ret; +} + +uint64 +sys_pipe(void) +{ +  uint64 fdarray; // user pointer to array of two integers +  struct file *rf, *wf; +  int fd0, fd1; +  struct proc *p = myproc(); + +  if(argaddr(0, &fdarray) < 0) +    return -1; +  if(pipealloc(&rf, &wf) < 0) +    return -1; +  fd0 = -1; +  if((fd0 = fdalloc(rf)) < 0 || (fd1 = fdalloc(wf)) < 0){ +    if(fd0 >= 0) +      p->ofile[fd0] = 0; +    fileclose(rf); +    fileclose(wf); +    return -1; +  } +  if(copyout(p->pagetable, fdarray, (char*)&fd0, sizeof(fd0)) < 0 || +     copyout(p->pagetable, fdarray+sizeof(fd0), (char *)&fd1, sizeof(fd1)) < 0){ +    p->ofile[fd0] = 0; +    p->ofile[fd1] = 0; +    fileclose(rf); +    fileclose(wf); +    return -1; +  } +  return 0; +} diff --git a/kernel/sysproc.c b/kernel/sysproc.c new file mode 100644 index 0000000..face81a --- /dev/null +++ b/kernel/sysproc.c @@ -0,0 +1,91 @@ +#include "types.h" +#include "riscv.h" +#include "defs.h" +#include "date.h" +#include "param.h" +#include "memlayout.h" +#include "spinlock.h" +#include "proc.h" + +uint64 +sys_exit(void) +{ +  exit(); +  return 0;  // not reached +} + +uint64 +sys_getpid(void) +{ +  return myproc()->pid; +} + +uint64 +sys_fork(void) +{ +  return fork(); +} + +uint64 +sys_wait(void) +{ +  return wait(); +} + +uint64 +sys_sbrk(void) +{ +  int addr; +  int n; + +  if(argint(0, &n) < 0) +    return -1; +  addr = myproc()->sz; +  if(growproc(n) < 0) +    return -1; +  return addr; +} + +uint64 +sys_sleep(void) +{ +  int n; +  uint ticks0; + +  if(argint(0, &n) < 0) +    return -1; +  acquire(&tickslock); +  ticks0 = ticks; +  while(ticks - ticks0 < n){ +    if(myproc()->killed){ +      release(&tickslock); +      return -1; +    } +    sleep(&ticks, &tickslock); +  } +  release(&tickslock); +  return 0; +} + +uint64 +sys_kill(void) +{ +  int pid; + +  if(argint(0, &pid) < 0) +    return -1; +  return kill(pid); +} + +// return how many clock tick interrupts have occurred +// since start. +uint64 +sys_uptime(void) +{ +  uint xticks; + +  acquire(&tickslock); +  xticks = ticks; +  release(&tickslock); +  return xticks; +} diff --git a/kernel/trampoline.S b/kernel/trampoline.S new file mode 100644 index 0000000..24499d9 --- /dev/null +++ b/kernel/trampoline.S @@ -0,0 +1,141 @@ +	# +        # code to switch between user and kernel space. +        # +        # this code is mapped at the same virtual address +        # (TRAMPOLINE) in user and kernel space so that +        # it continues to work when it switches page tables. +	# +	# kernel.ld causes this to be aligned +        # to a page boundary. 
+        # +	.section trampsec +.globl trampoline +trampoline: +.align 4 +.globl uservec +uservec:     +	# +        # trap.c sets stvec to point here, so +        # traps from user space start here, +        # in supervisor mode, but with a +        # user page table. +        # +        # sscratch points to where the process's p->tf is +        # mapped into user space, at TRAPFRAME. +        # +         +	# swap a0 and sscratch +        # so that a0 is TRAPFRAME +        csrrw a0, sscratch, a0 + +        # save the user registers in TRAPFRAME +        sd ra, 40(a0) +        sd sp, 48(a0) +        sd gp, 56(a0) +        sd tp, 64(a0) +        sd t0, 72(a0) +        sd t1, 80(a0) +        sd t2, 88(a0) +        sd s0, 96(a0) +        sd s1, 104(a0) +        sd a1, 120(a0) +        sd a2, 128(a0) +        sd a3, 136(a0) +        sd a4, 144(a0) +        sd a5, 152(a0) +        sd a6, 160(a0) +        sd a7, 168(a0) +        sd s2, 176(a0) +        sd s3, 184(a0) +        sd s4, 192(a0) +        sd s5, 200(a0) +        sd s6, 208(a0) +        sd s7, 216(a0) +        sd s8, 224(a0) +        sd s9, 232(a0) +        sd s10, 240(a0) +        sd s11, 248(a0) +        sd t3, 256(a0) +        sd t4, 264(a0) +        sd t5, 272(a0) +        sd t6, 280(a0) + +	# save the user a0 in p->tf->a0 +        csrr t0, sscratch +        sd t0, 112(a0) + +        # restore kernel stack pointer from p->tf->kernel_sp +        ld sp, 8(a0) + +        # make tp hold the current hartid, from p->tf->kernel_hartid +        ld tp, 32(a0) + +        # remember the address of usertrap(), p->tf->kernel_trap +        ld t0, 16(a0) + +        # restore kernel page table from p->tf->kernel_satp +        ld t1, 0(a0) +	sfence.vma zero, zero +        csrw satp, t1 + +        # a0 is no longer valid, since the kernel page +        # table does not specially map p->td. + +        # jump to usertrap(), which does not return +        jr t0 + +.globl userret +userret: +        # userret(TRAPFRAME, pagetable) +        # switch from kernel to user. +        # usertrapret() calls here. +	# a0: TRAPFRAME, in user page table +        # a1: user page table, for satp + +	# switch to the user page table. +	sfence.vma zero, zero +        csrw satp, a1 + +        # put the saved user a0 in sscratch, so we +        # can swap it with our a0 (TRAPFRAME) in the last step. +        ld t0, 112(a0) +        csrw sscratch, t0 + +        # restore all but a0 from TRAPFRAME +        ld ra, 40(a0) +        ld sp, 48(a0) +        ld gp, 56(a0) +        ld tp, 64(a0) +        ld t0, 72(a0) +        ld t1, 80(a0) +        ld t2, 88(a0) +        ld s0, 96(a0) +        ld s1, 104(a0) +        ld a1, 120(a0) +        ld a2, 128(a0) +        ld a3, 136(a0) +        ld a4, 144(a0) +        ld a5, 152(a0) +        ld a6, 160(a0) +        ld a7, 168(a0) +        ld s2, 176(a0) +        ld s3, 184(a0) +        ld s4, 192(a0) +        ld s5, 200(a0) +        ld s6, 208(a0) +        ld s7, 216(a0) +        ld s8, 224(a0) +        ld s9, 232(a0) +        ld s10, 240(a0) +        ld s11, 248(a0) +        ld t3, 256(a0) +        ld t4, 264(a0) +        ld t5, 272(a0) +        ld t6, 280(a0) + +	# restore user a0, and save TRAPFRAME in sscratch +        csrrw a0, sscratch, a0 +         +        # return to user mode and user pc. +        # usertrapret() set up sstatus and sepc. 
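+        # sret restores the privilege level from sstatus.SPP (User,
+        # set up by usertrapret), copies SPIE back into SIE, and
+        # jumps to the address in sepc.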
+        sret diff --git a/kernel/trap.c b/kernel/trap.c new file mode 100644 index 0000000..ec57bed --- /dev/null +++ b/kernel/trap.c @@ -0,0 +1,213 @@ +#include "types.h" +#include "param.h" +#include "memlayout.h" +#include "riscv.h" +#include "spinlock.h" +#include "proc.h" +#include "defs.h" + +struct spinlock tickslock; +uint ticks; + +extern char trampoline[], uservec[], userret[]; + +// in kernelvec.S, calls kerneltrap(). +void kernelvec(); + +extern int devintr(); + +void +trapinit(void) +{ +  initlock(&tickslock, "time"); +} + +// set up to take exceptions and traps while in the kernel. +void +trapinithart(void) +{ +  w_stvec((uint64)kernelvec); +} + +// +// handle an interrupt, exception, or system call from user space. +// called from trampoline.S +// +void +usertrap(void) +{ +  int which_dev = 0; + +  if((r_sstatus() & SSTATUS_SPP) != 0) +    panic("usertrap: not from user mode"); + +  // send interrupts and exceptions to kerneltrap(), +  // since we're now in the kernel. +  w_stvec((uint64)kernelvec); + +  struct proc *p = myproc(); +   +  // save user program counter. +  p->tf->epc = r_sepc(); +   +  if(r_scause() == 8){ +    // system call + +    if(p->killed) +      exit(); + +    // sepc points to the ecall instruction, +    // but we want to return to the next instruction. +    p->tf->epc += 4; + +    // an interrupt will change sstatus &c registers, +    // so don't enable until done with those registers. +    intr_on(); + +    syscall(); +  } else if((which_dev = devintr()) != 0){ +    // ok +  } else { +    printf("usertrap(): unexpected scause %p pid=%d\n", r_scause(), p->pid); +    printf("            sepc=%p stval=%p\n", r_sepc(), r_stval()); +    p->killed = 1; +  } + +  if(p->killed) +    exit(); + +  // give up the CPU if this is a timer interrupt. +  if(which_dev == 2) +    yield(); + +  usertrapret(); +} + +// +// return to user space +// +void +usertrapret(void) +{ +  struct proc *p = myproc(); + +  // turn off interrupts, since we're switching +  // now from kerneltrap() to usertrap(). +  intr_off(); + +  // send interrupts and exceptions to trampoline.S +  w_stvec(TRAMPOLINE + (uservec - trampoline)); + +  // set up values that uservec will need when +  // the process next re-enters the kernel. +  p->tf->kernel_satp = r_satp();         // kernel page table +  p->tf->kernel_sp = p->kstack + PGSIZE; // process's kernel stack +  p->tf->kernel_trap = (uint64)usertrap; +  p->tf->kernel_hartid = r_tp();         // hartid for cpuid() + +  // set up the registers that trampoline.S's sret will use +  // to get to user space. +   +  // set S Previous Privilege mode to User. +  unsigned long x = r_sstatus(); +  x &= ~SSTATUS_SPP; // clear SPP to 0 for user mode +  x |= SSTATUS_SPIE; // enable interrupts in user mode +  w_sstatus(x); + +  // set S Exception Program Counter to the saved user pc. +  w_sepc(p->tf->epc); + +  // tell trampoline.S the user page table to switch to. +  uint64 satp = MAKE_SATP(p->pagetable); + +  // jump to trampoline.S at the top of memory, which  +  // switches to the user page table, restores user registers, +  // and switches to user mode with sret. +  uint64 fn = TRAMPOLINE + (userret - trampoline); +  ((void (*)(uint64,uint64))fn)(TRAPFRAME, satp); +} + +// interrupts and exceptions from kernel code go here via kernelvec, +// on whatever the current kernel stack is. +// must be 4-byte aligned to fit in stvec. 
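+// kernelvec.S has already pushed all registers onto the kernel
+// stack; kerneltrap() only has to preserve sepc and sstatus, which
+// a yield() to another process could otherwise clobber.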
+void  +kerneltrap() +{ +  int which_dev = 0; +  uint64 sepc = r_sepc(); +  uint64 sstatus = r_sstatus(); +  uint64 scause = r_scause(); +   +  if((sstatus & SSTATUS_SPP) == 0) +    panic("kerneltrap: not from supervisor mode"); +  if(intr_get() != 0) +    panic("kerneltrap: interrupts enabled"); + +  if((which_dev = devintr()) == 0){ +    printf("scause %p\n", scause); +    printf("sepc=%p stval=%p\n", r_sepc(), r_stval()); +    panic("kerneltrap"); +  } + +  // give up the CPU if this is a timer interrupt. +  if(which_dev == 2 && myproc() != 0 && myproc()->state == RUNNING) +    yield(); + +  // the yield() may have caused some traps to occur, +  // so restore trap registers for use by kernelvec.S's sepc instruction. +  w_sepc(sepc); +  w_sstatus(sstatus); +} + +void +clockintr() +{ +  acquire(&tickslock); +  ticks++; +  wakeup(&ticks); +  release(&tickslock); +} + +// check if it's an external interrupt or software interrupt, +// and handle it. +// returns 2 if timer interrupt, +// 1 if other device, +// 0 if not recognized. +int +devintr() +{ +  uint64 scause = r_scause(); + +  if((scause & 0x8000000000000000L) && +     (scause & 0xff) == 9){ +    // this is a supervisor external interrupt, via PLIC. + +    // irq indicates which device interrupted. +    int irq = plic_claim(); + +    if(irq == UART0_IRQ){ +      uartintr(); +    } else if(irq == VIRTIO0_IRQ){ +      virtio_disk_intr(); +    } + +    plic_complete(irq); +    return 1; +  } else if(scause == 0x8000000000000001L){ +    // software interrupt from a machine-mode timer interrupt, +    // forwarded by timervec in kernelvec.S. + +    if(cpuid() == 0){ +      clockintr(); +    } +     +    // acknowledge the software interrupt by clearing +    // the SSIP bit in sip. +    w_sip(r_sip() & ~2); + +    return 2; +  } else { +    return 0; +  } +} + diff --git a/kernel/types.h b/kernel/types.h new file mode 100644 index 0000000..ee73164 --- /dev/null +++ b/kernel/types.h @@ -0,0 +1,10 @@ +typedef unsigned int   uint; +typedef unsigned short ushort; +typedef unsigned char  uchar; + +typedef unsigned char uint8; +typedef unsigned short uint16; +typedef unsigned int  uint32; +typedef unsigned long uint64; + +typedef uint64 pde_t; diff --git a/kernel/uart.c b/kernel/uart.c new file mode 100644 index 0000000..3a5cdc4 --- /dev/null +++ b/kernel/uart.c @@ -0,0 +1,92 @@ +// +// low-level driver routines for 16550a UART. +// + +#include "types.h" +#include "param.h" +#include "memlayout.h" +#include "riscv.h" +#include "spinlock.h" +#include "proc.h" +#include "defs.h" + +// the UART control registers are memory-mapped +// at address UART0. this macro returns the +// address of one of the registers. +#define Reg(reg) ((volatile unsigned char *)(UART0 + reg)) + +// the UART control registers. +// some have different meanings for +// read vs write. +// http://byterunner.com/16550.html +#define RHR 0 // receive holding register (for input bytes) +#define THR 0 // transmit holding register (for output bytes) +#define IER 1 // interrupt enable register +#define FCR 2 // FIFO control register +#define ISR 2 // interrupt status register +#define LCR 3 // line control register +#define LSR 5 // line status register + +#define ReadReg(reg) (*(Reg(reg))) +#define WriteReg(reg, v) (*(Reg(reg)) = (v)) + +void +uartinit(void) +{ +  // disable interrupts. +  WriteReg(IER, 0x00); + +  // special mode to set baud rate. +  WriteReg(LCR, 0x80); + +  // LSB for baud rate of 38.4K. +  WriteReg(0, 0x03); + +  // MSB for baud rate of 38.4K. 
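+  // (with LCR bit 7 set, registers 0 and 1 are the 16550's
+  // divisor latch, LSB then MSB.)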
+  WriteReg(1, 0x00); + +  // leave set-baud mode, +  // and set word length to 8 bits, no parity. +  WriteReg(LCR, 0x03); + +  // reset and enable FIFOs. +  WriteReg(FCR, 0x07); + +  // enable receive interrupts. +  WriteReg(IER, 0x01); +} + +// write one output character to the UART. +void +uartputc(int c) +{ +  // wait for Transmit Holding Empty to be set in LSR. +  while((ReadReg(LSR) & (1 << 5)) == 0) +    ; +  WriteReg(THR, c); +} + +// read one input character from the UART. +// return -1 if none is waiting. +int +uartgetc(void) +{ +  if(ReadReg(LSR) & 0x01){ +    // input data is ready. +    return ReadReg(RHR); +  } else { +    return -1; +  } +} + +// trap.c calls here when the uart interrupts. +void +uartintr(void) +{ +  while(1){ +    int c = uartgetc(); +    if(c == -1) +      break; +    consoleintr(c); +  } +} diff --git a/kernel/virtio.h b/kernel/virtio.h new file mode 100644 index 0000000..03b53a9 --- /dev/null +++ b/kernel/virtio.h @@ -0,0 +1,72 @@ +// +// virtio device definitions. +// for both the mmio interface, and virtio descriptors. +// only tested with qemu. +// this is the "legacy" virtio interface. +// +// the virtio spec: +// https://docs.oasis-open.org/virtio/virtio/v1.1/virtio-v1.1.pdf +// + +// virtio mmio control registers, mapped starting at 0x10001000. +// from qemu virtio_mmio.h +#define VIRTIO_MMIO_MAGIC_VALUE		0x000 // 0x74726976 +#define VIRTIO_MMIO_VERSION		0x004 // version; 1 is legacy +#define VIRTIO_MMIO_DEVICE_ID		0x008 // device type; 1 is net, 2 is disk +#define VIRTIO_MMIO_VENDOR_ID		0x00c // 0x554d4551 +#define VIRTIO_MMIO_DEVICE_FEATURES	0x010 +#define VIRTIO_MMIO_DRIVER_FEATURES	0x020 +#define VIRTIO_MMIO_GUEST_PAGE_SIZE	0x028 // page size for PFN, write-only +#define VIRTIO_MMIO_QUEUE_SEL		0x030 // select queue, write-only +#define VIRTIO_MMIO_QUEUE_NUM_MAX	0x034 // max size of current queue, read-only +#define VIRTIO_MMIO_QUEUE_NUM		0x038 // size of current queue, write-only +#define VIRTIO_MMIO_QUEUE_ALIGN		0x03c // used ring alignment, write-only +#define VIRTIO_MMIO_QUEUE_PFN		0x040 // physical page number for queue, read/write +#define VIRTIO_MMIO_QUEUE_READY		0x044 // ready bit +#define VIRTIO_MMIO_QUEUE_NOTIFY	0x050 // write-only +#define VIRTIO_MMIO_INTERRUPT_STATUS	0x060 // read-only +#define VIRTIO_MMIO_INTERRUPT_ACK	0x064 // write-only +#define VIRTIO_MMIO_STATUS		0x070 // read/write + +// status register bits, from qemu virtio_config.h +#define VIRTIO_CONFIG_S_ACKNOWLEDGE	1 +#define VIRTIO_CONFIG_S_DRIVER		2 +#define VIRTIO_CONFIG_S_DRIVER_OK	4 +#define VIRTIO_CONFIG_S_FEATURES_OK	8 + +// device feature bits +#define VIRTIO_BLK_F_RO              5	/* Disk is read-only */ +#define VIRTIO_BLK_F_SCSI            7	/* Supports scsi command passthru */ +#define VIRTIO_BLK_F_CONFIG_WCE     11	/* Writeback mode available in config */ +#define VIRTIO_BLK_F_MQ             12	/* support more than one vq */ +#define VIRTIO_F_ANY_LAYOUT         27 +#define VIRTIO_RING_F_INDIRECT_DESC 28 +#define VIRTIO_RING_F_EVENT_IDX     29 + +// this many virtio descriptors. +// must be a power of two. 
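+// (the driver indexes the avail and used rings modulo NUM, and the
+// legacy virtio interface uses power-of-two queue sizes.)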
+#define NUM 8 + +struct VRingDesc { +  uint64 addr; +  uint32 len; +  uint16 flags; +  uint16 next; +}; +#define VRING_DESC_F_NEXT  1 // chained with another descriptor +#define VRING_DESC_F_WRITE 2 // device writes (vs read) + +struct VRingUsedElem { +  uint32 id;   // index of start of completed descriptor chain +  uint32 len; +}; + +// for disk ops +#define VIRTIO_BLK_T_IN  0 // read the disk +#define VIRTIO_BLK_T_OUT 1 // write the disk + +struct UsedArea { +  uint16 flags; +  uint16 id; +  struct VRingUsedElem elems[NUM]; +}; diff --git a/kernel/virtio_disk.c b/kernel/virtio_disk.c new file mode 100644 index 0000000..3cff024 --- /dev/null +++ b/kernel/virtio_disk.c @@ -0,0 +1,269 @@ +// +// driver for qemu's virtio disk device. +// uses qemu's mmio interface to virtio. +// qemu presents a "legacy" virtio interface. +// +// qemu ... -drive file=fs.img,if=none,format=raw,id=x0 -device virtio-blk-device,drive=x0,bus=virtio-mmio-bus.0 +// + +#include "types.h" +#include "riscv.h" +#include "defs.h" +#include "param.h" +#include "memlayout.h" +#include "spinlock.h" +#include "sleeplock.h" +#include "fs.h" +#include "buf.h" +#include "virtio.h" + +// the address of virtio mmio register r. +#define R(r) ((volatile uint32 *)(VIRTIO0 + (r))) + +static struct disk { + // memory for virtio descriptors &c for queue 0. + // this is a global instead of allocated because it must + // be multiple contiguous pages, which kalloc() + // doesn't support, and page aligned. +  char pages[2*PGSIZE]; +  struct VRingDesc *desc; +  uint16 *avail; +  struct UsedArea *used; + +  // our own book-keeping. +  char free[NUM];  // is a descriptor free? +  uint16 used_idx; // we've looked this far in used[2..NUM]. + +  // track info about in-flight operations, +  // for use when completion interrupt arrives. +  // indexed by first descriptor index of chain. +  struct { +    struct buf *b; +    char status; +  } info[NUM]; +   +  struct spinlock vdisk_lock; +   +} __attribute__ ((aligned (PGSIZE))) disk; + +void +virtio_disk_init(void) +{ +  uint32 status = 0; + +  initlock(&disk.vdisk_lock, "virtio_disk"); + +  if(*R(VIRTIO_MMIO_MAGIC_VALUE) != 0x74726976 || +     *R(VIRTIO_MMIO_VERSION) != 1 || +     *R(VIRTIO_MMIO_DEVICE_ID) != 2 || +     *R(VIRTIO_MMIO_VENDOR_ID) != 0x554d4551){ +    panic("could not find virtio disk"); +  } +   +  status |= VIRTIO_CONFIG_S_ACKNOWLEDGE; +  *R(VIRTIO_MMIO_STATUS) = status; + +  status |= VIRTIO_CONFIG_S_DRIVER; +  *R(VIRTIO_MMIO_STATUS) = status; + +  // negotiate features +  uint64 features = *R(VIRTIO_MMIO_DEVICE_FEATURES); +  features &= ~(1 << VIRTIO_BLK_F_RO); +  features &= ~(1 << VIRTIO_BLK_F_SCSI); +  features &= ~(1 << VIRTIO_BLK_F_CONFIG_WCE); +  features &= ~(1 << VIRTIO_BLK_F_MQ); +  features &= ~(1 << VIRTIO_F_ANY_LAYOUT); +  features &= ~(1 << VIRTIO_RING_F_EVENT_IDX); +  features &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC); +  *R(VIRTIO_MMIO_DRIVER_FEATURES) = features; + +  // tell device that feature negotiation is complete. +  status |= VIRTIO_CONFIG_S_FEATURES_OK; +  *R(VIRTIO_MMIO_STATUS) = status; + +  // tell device we're completely ready. +  status |= VIRTIO_CONFIG_S_DRIVER_OK; +  *R(VIRTIO_MMIO_STATUS) = status; + +  *R(VIRTIO_MMIO_GUEST_PAGE_SIZE) = PGSIZE; + +  // initialize queue 0. 
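+  // select the queue, check and set its size, then hand the device
+  // the page number of the memory that holds desc/avail/used.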
+  *R(VIRTIO_MMIO_QUEUE_SEL) = 0; +  uint32 max = *R(VIRTIO_MMIO_QUEUE_NUM_MAX); +  if(max == 0) +    panic("virtio disk has no queue 0"); +  if(max < NUM) +    panic("virtio disk max queue too short"); +  *R(VIRTIO_MMIO_QUEUE_NUM) = NUM; +  memset(disk.pages, 0, sizeof(disk.pages)); +  *R(VIRTIO_MMIO_QUEUE_PFN) = ((uint64)disk.pages) >> PGSHIFT; + +  // desc = pages -- num * VRingDesc +  // avail = pages + 0x40 -- 2 * uint16, then num * uint16 +  // used = pages + 4096 -- 2 * uint16, then num * vRingUsedElem + +  disk.desc = (struct VRingDesc *) disk.pages; +  disk.avail = (uint16*)(((char*)disk.desc) + NUM*sizeof(struct VRingDesc)); +  disk.used = (struct UsedArea *) (disk.pages + PGSIZE); + +  for(int i = 0; i < NUM; i++) +    disk.free[i] = 1; + +  // plic.c and trap.c arrange for interrupts from VIRTIO0_IRQ. +} + +// find a free descriptor, mark it non-free, return its index. +static int +alloc_desc() +{ +  for(int i = 0; i < NUM; i++){ +    if(disk.free[i]){ +      disk.free[i] = 0; +      return i; +    } +  } +  return -1; +} + +// mark a descriptor as free. +static void +free_desc(int i) +{ +  if(i >= NUM) +    panic("virtio_disk_intr 1"); +  if(disk.free[i]) +    panic("virtio_disk_intr 2"); +  disk.desc[i].addr = 0; +  disk.free[i] = 1; +  wakeup(&disk.free[0]); +} + +// free a chain of descriptors. +static void +free_chain(int i) +{ +  while(1){ +    free_desc(i); +    if(disk.desc[i].flags & VRING_DESC_F_NEXT) +      i = disk.desc[i].next; +    else +      break; +  } +} + +static int +alloc3_desc(int *idx) +{ +  for(int i = 0; i < 3; i++){ +    idx[i] = alloc_desc(); +    if(idx[i] < 0){ +      for(int j = 0; j < i; j++) +        free_desc(idx[j]); +      return -1; +    } +  } +  return 0; +} + +void +virtio_disk_rw(struct buf *b, int write) +{ +  uint64 sector = b->blockno * (BSIZE / 512); + +  acquire(&disk.vdisk_lock); + +  // the spec says that legacy block operations use three +  // descriptors: one for type/reserved/sector, one for +  // the data, one for a 1-byte status result. + +  // allocate the three descriptors. +  int idx[3]; +  while(1){ +    if(alloc3_desc(idx) == 0) { +      break; +    } +    sleep(&disk.free[0], &disk.vdisk_lock); +  } +   +  // format the three descriptors. +  // qemu's virtio-blk.c reads them. + +  struct virtio_blk_outhdr { +    uint32 type; +    uint32 reserved; +    uint64 sector; +  } buf0; + +  if(write) +    buf0.type = VIRTIO_BLK_T_OUT; // write the disk +  else +    buf0.type = VIRTIO_BLK_T_IN; // read the disk +  buf0.reserved = 0; +  buf0.sector = sector; + +  // buf0 is on a kernel stack, which is not direct mapped, +  // thus the call to kvmpa(). +  disk.desc[idx[0]].addr = (uint64) kvmpa((uint64) &buf0); +  disk.desc[idx[0]].len = sizeof(buf0); +  disk.desc[idx[0]].flags = VRING_DESC_F_NEXT; +  disk.desc[idx[0]].next = idx[1]; + +  disk.desc[idx[1]].addr = (uint64) b->data; +  disk.desc[idx[1]].len = BSIZE; +  if(write) +    disk.desc[idx[1]].flags = 0; // device reads b->data +  else +    disk.desc[idx[1]].flags = VRING_DESC_F_WRITE; // device writes b->data +  disk.desc[idx[1]].flags |= VRING_DESC_F_NEXT; +  disk.desc[idx[1]].next = idx[2]; + +  disk.info[idx[0]].status = 0; +  disk.desc[idx[2]].addr = (uint64) &disk.info[idx[0]].status; +  disk.desc[idx[2]].len = 1; +  disk.desc[idx[2]].flags = VRING_DESC_F_WRITE; // device writes the status +  disk.desc[idx[2]].next = 0; + +  // record struct buf for virtio_disk_intr(). 
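+  // b->disk doubles as the wait condition: it stays 1 until
+  // virtio_disk_intr() sees the completion and clears it.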
+  b->disk = 1; +  disk.info[idx[0]].b = b; + +  // avail[0] is flags +  // avail[1] tells the device how far to look in avail[2...]. +  // avail[2...] are desc[] indices the device should process. +  // we only tell device the first index in our chain of descriptors. +  disk.avail[2 + (disk.avail[1] % NUM)] = idx[0]; +  __sync_synchronize(); +  disk.avail[1] = disk.avail[1] + 1; + +  *R(VIRTIO_MMIO_QUEUE_NOTIFY) = 0; // value is queue number + +  // Wait for virtio_disk_intr() to say request has finished. +  while(b->disk == 1) { +    sleep(b, &disk.vdisk_lock); +  } + +  disk.info[idx[0]].b = 0; +  free_chain(idx[0]); + +  release(&disk.vdisk_lock); +} + +void +virtio_disk_intr() +{ +  acquire(&disk.vdisk_lock); + +  while((disk.used_idx % NUM) != (disk.used->id % NUM)){ +    int id = disk.used->elems[disk.used_idx].id; + +    if(disk.info[id].status != 0) +      panic("virtio_disk_intr status"); +     +    disk.info[id].b->disk = 0;   // disk is done with buf +    wakeup(disk.info[id].b); + +    disk.used_idx = (disk.used_idx + 1) % NUM; +  } + +  release(&disk.vdisk_lock); +} diff --git a/kernel/vm.c b/kernel/vm.c new file mode 100644 index 0000000..3631c9c --- /dev/null +++ b/kernel/vm.c @@ -0,0 +1,441 @@ +#include "param.h" +#include "types.h" +#include "memlayout.h" +#include "elf.h" +#include "riscv.h" +#include "defs.h" +#include "fs.h" + +/* + * the kernel's page table. + */ +pagetable_t kernel_pagetable; + +extern char etext[];  // kernel.ld sets this to end of kernel code. + +extern char trampoline[]; // trampoline.S + +/* + * create a direct-map page table for the kernel and + * turn on paging. called early, in supervisor mode. + * the page allocator is already initialized. + */ +void +kvminit() +{ +  kernel_pagetable = (pagetable_t) kalloc(); +  memset(kernel_pagetable, 0, PGSIZE); + +  // uart registers +  kvmmap(UART0, UART0, PGSIZE, PTE_R | PTE_W); + +  // virtio mmio disk interface +  kvmmap(VIRTIO0, VIRTIO0, PGSIZE, PTE_R | PTE_W); + +  // CLINT +  kvmmap(CLINT, CLINT, 0x10000, PTE_R | PTE_W); + +  // PLIC +  kvmmap(PLIC, PLIC, 0x400000, PTE_R | PTE_W); + +  // map kernel text executable and read-only. +  kvmmap(KERNBASE, KERNBASE, (uint64)etext-KERNBASE, PTE_R | PTE_X); + +  // map kernel data and the physical RAM we'll make use of. +  kvmmap((uint64)etext, (uint64)etext, PHYSTOP-(uint64)etext, PTE_R | PTE_W); + +  // map the trampoline for trap entry/exit to +  // the highest virtual address in the kernel. +  kvmmap(TRAMPOLINE, (uint64)trampoline, PGSIZE, PTE_R | PTE_X); +} + +// Switch h/w page table register to the kernel's page table, +// and enable paging. +void +kvminithart() +{ +  sfence_vma(); +  w_satp(MAKE_SATP(kernel_pagetable)); +} + +// Return the address of the PTE in page table pagetable +// that corresponds to virtual address va.  If alloc!=0, +// create any required page-table pages. +// +// The risc-v Sv39 scheme has three levels of page-table +// pages. A page-table page contains 512 64-bit PTEs. +// A 64-bit virtual address is split into five fields: +//   39..63 -- must be zero. +//   30..38 -- 9 bits of level-2 index. +//   21..39 -- 9 bits of level-1 index. +//   12..20 -- 9 bits of level-0 index. +//    0..12 -- 12 bits of byte offset within the page. 
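+// (PX(level, va), defined in riscv.h, extracts the 9-bit index for
+// the given level; for example PX(1, va) is (va >> 21) & 0x1FF.)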
+static pte_t * +walk(pagetable_t pagetable, uint64 va, int alloc) +{ +  if(va >= MAXVA) +    panic("walk"); + +  for(int level = 2; level > 0; level--) { +    pte_t *pte = &pagetable[PX(level, va)]; +    if(*pte & PTE_V) { +      pagetable = (pagetable_t)PTE2PA(*pte); +    } else { +      if(!alloc || (pagetable = (pde_t*)kalloc()) == 0) +        return 0; +      memset(pagetable, 0, PGSIZE); +      *pte = PA2PTE(pagetable) | PTE_V; +    } +  } +  return &pagetable[PX(0, va)]; +} + +// Look up a virtual address, return the physical address, +// or 0 if not mapped. +// Can only be used to look up user pages. +uint64 +walkaddr(pagetable_t pagetable, uint64 va) +{ +  pte_t *pte; +  uint64 pa; + +  pte = walk(pagetable, va, 0); +  if(pte == 0) +    return 0; +  if((*pte & PTE_V) == 0) +    return 0; +  if((*pte & PTE_U) == 0) +    return 0; +  pa = PTE2PA(*pte); +  return pa; +} + +// add a mapping to the kernel page table. +// only used when booting. +// does not flush TLB or enable paging. +void +kvmmap(uint64 va, uint64 pa, uint64 sz, int perm) +{ +  if(mappages(kernel_pagetable, va, sz, pa, perm) != 0) +    panic("kvmmap"); +} + +// translate a kernel virtual address to +// a physical address. only needed for +// addresses on the stack. +// assumes va is page aligned. +uint64 +kvmpa(uint64 va) +{ +  uint64 off = va % PGSIZE; +  pte_t *pte; +  uint64 pa; +   +  pte = walk(kernel_pagetable, va, 0); +  if(pte == 0) +    panic("kvmpa"); +  if((*pte & PTE_V) == 0) +    panic("kvmpa"); +  pa = PTE2PA(*pte); +  return pa+off; +} + +// Create PTEs for virtual addresses starting at va that refer to +// physical addresses starting at pa. va and size might not +// be page-aligned. Returns 0 on success, -1 if walk() couldn't +// allocate a needed page-table page. +int +mappages(pagetable_t pagetable, uint64 va, uint64 size, uint64 pa, int perm) +{ +  uint64 a, last; +  pte_t *pte; + +  a = PGROUNDDOWN(va); +  last = PGROUNDDOWN(va + size - 1); +  for(;;){ +    if((pte = walk(pagetable, a, 1)) == 0) +      return -1; +    if(*pte & PTE_V) +      panic("remap"); +    *pte = PA2PTE(pa) | perm | PTE_V; +    if(a == last) +      break; +    a += PGSIZE; +    pa += PGSIZE; +  } +  return 0; +} + +// Remove mappings from a page table. The mappings in +// the given range must exist. Optionally free the +// physical memory. +void +uvmunmap(pagetable_t pagetable, uint64 va, uint64 size, int do_free) +{ +  uint64 a, last; +  pte_t *pte; +  uint64 pa; + +  a = PGROUNDDOWN(va); +  last = PGROUNDDOWN(va + size - 1); +  for(;;){ +    if((pte = walk(pagetable, a, 0)) == 0) +      panic("uvmunmap: walk"); +    if((*pte & PTE_V) == 0){ +      printf("va=%p pte=%p\n", a, *pte); +      panic("uvmunmap: not mapped"); +    } +    if(PTE_FLAGS(*pte) == PTE_V) +      panic("uvmunmap: not a leaf"); +    if(do_free){ +      pa = PTE2PA(*pte); +      kfree((void*)pa); +    } +    *pte = 0; +    if(a == last) +      break; +    a += PGSIZE; +    pa += PGSIZE; +  } +} + +// create an empty user page table. +pagetable_t +uvmcreate() +{ +  pagetable_t pagetable; +  pagetable = (pagetable_t) kalloc(); +  if(pagetable == 0) +    panic("uvmcreate: out of memory"); +  memset(pagetable, 0, PGSIZE); +  return pagetable; +} + +// Load the user initcode into address 0 of pagetable, +// for the very first process. +// sz must be less than a page. 
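+// (the caller passes the assembled initcode binary, which just
+// exec()s /init and is far smaller than a page.)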
+void +uvminit(pagetable_t pagetable, uchar *src, uint sz) +{ +  char *mem; + +  if(sz >= PGSIZE) +    panic("inituvm: more than a page"); +  mem = kalloc(); +  memset(mem, 0, PGSIZE); +  mappages(pagetable, 0, PGSIZE, (uint64)mem, PTE_W|PTE_R|PTE_X|PTE_U); +  memmove(mem, src, sz); +} + +// Allocate PTEs and physical memory to grow process from oldsz to +// newsz, which need not be page aligned.  Returns new size or 0 on error. +uint64 +uvmalloc(pagetable_t pagetable, uint64 oldsz, uint64 newsz) +{ +  char *mem; +  uint64 a; + +  if(newsz < oldsz) +    return oldsz; + +  oldsz = PGROUNDUP(oldsz); +  a = oldsz; +  for(; a < newsz; a += PGSIZE){ +    mem = kalloc(); +    if(mem == 0){ +      uvmdealloc(pagetable, a, oldsz); +      return 0; +    } +    memset(mem, 0, PGSIZE); +    if(mappages(pagetable, a, PGSIZE, (uint64)mem, PTE_W|PTE_X|PTE_R|PTE_U) != 0){ +      kfree(mem); +      uvmdealloc(pagetable, a, oldsz); +      return 0; +    } +  } +  return newsz; +} + +// Deallocate user pages to bring the process size from oldsz to +// newsz.  oldsz and newsz need not be page-aligned, nor does newsz +// need to be less than oldsz.  oldsz can be larger than the actual +// process size.  Returns the new process size. +uint64 +uvmdealloc(pagetable_t pagetable, uint64 oldsz, uint64 newsz) +{ +  if(newsz >= oldsz) +    return oldsz; +  uvmunmap(pagetable, newsz, oldsz - newsz, 1); +  return newsz; +} + +// Recursively free page-table pages. +// All leaf mappings must already have been removed. +static void +freewalk(pagetable_t pagetable) +{ +  // there are 2^9 = 512 PTEs in a page table. +  for(int i = 0; i < 512; i++){ +    pte_t pte = pagetable[i]; +    if((pte & PTE_V) && (pte & (PTE_R|PTE_W|PTE_X)) == 0){ +      // this PTE points to a lower-level page table. +      uint64 child = PTE2PA(pte); +      freewalk((pagetable_t)child); +      pagetable[i] = 0; +    } else if(pte & PTE_V){ +      panic("freewalk: leaf"); +    } +  } +  kfree((void*)pagetable); +} + +// Free user memory pages, +// then free page-table pages. +void +uvmfree(pagetable_t pagetable, uint64 sz) +{ +  uvmunmap(pagetable, 0, sz, 1); +  freewalk(pagetable); +} + +// Given a parent process's page table, copy +// its memory into a child's page table. +// Copies both the page table and the +// physical memory. +// returns 0 on success, -1 on failure. +// frees any allocated pages on failure. +int +uvmcopy(pagetable_t old, pagetable_t new, uint64 sz) +{ +  pte_t *pte; +  uint64 pa, i; +  uint flags; +  char *mem; + +  for(i = 0; i < sz; i += PGSIZE){ +    if((pte = walk(old, i, 0)) == 0) +      panic("copyuvm: pte should exist"); +    if((*pte & PTE_V) == 0) +      panic("copyuvm: page not present"); +    pa = PTE2PA(*pte); +    flags = PTE_FLAGS(*pte); +    if((mem = kalloc()) == 0) +      goto err; +    memmove(mem, (char*)pa, PGSIZE); +    if(mappages(new, i, PGSIZE, (uint64)mem, flags) != 0){ +      kfree(mem); +      goto err; +    } +  } +  return 0; + + err: +  uvmunmap(new, 0, i, 1); +  return -1; +} + +// mark a PTE invalid for user access. +// used by exec for the user stack guard page. +void +uvmclear(pagetable_t pagetable, uint64 va) +{ +  pte_t *pte; +   +  pte = walk(pagetable, va, 0); +  if(pte == 0) +    panic("uvmclear"); +  *pte &= ~PTE_U; +} + +// Copy from kernel to user. +// Copy len bytes from src to virtual address dstva in a given page table. +// Return 0 on success, -1 on error. 
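+// (the destination range may span several pages; each page's base
+// address is translated separately with walkaddr().)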
+int +copyout(pagetable_t pagetable, uint64 dstva, char *src, uint64 len) +{ +  uint64 n, va0, pa0; + +  while(len > 0){ +    va0 = (uint)PGROUNDDOWN(dstva); +    pa0 = walkaddr(pagetable, va0); +    if(pa0 == 0) +      return -1; +    n = PGSIZE - (dstva - va0); +    if(n > len) +      n = len; +    memmove((void *)(pa0 + (dstva - va0)), src, n); + +    len -= n; +    src += n; +    dstva = va0 + PGSIZE; +  } +  return 0; +} + +// Copy from user to kernel. +// Copy len bytes to dst from virtual address srcva in a given page table. +// Return 0 on success, -1 on error. +int +copyin(pagetable_t pagetable, char *dst, uint64 srcva, uint64 len) +{ +  uint64 n, va0, pa0; + +  while(len > 0){ +    va0 = (uint)PGROUNDDOWN(srcva); +    pa0 = walkaddr(pagetable, va0); +    if(pa0 == 0) +      return -1; +    n = PGSIZE - (srcva - va0); +    if(n > len) +      n = len; +    memmove(dst, (void *)(pa0 + (srcva - va0)), n); + +    len -= n; +    dst += n; +    srcva = va0 + PGSIZE; +  } +  return 0; +} + +// Copy a null-terminated string from user to kernel. +// Copy bytes to dst from virtual address srcva in a given page table, +// until a '\0', or max. +// Return 0 on success, -1 on error. +int +copyinstr(pagetable_t pagetable, char *dst, uint64 srcva, uint64 max) +{ +  uint64 n, va0, pa0; +  int got_null = 0; + +  while(got_null == 0 && max > 0){ +    va0 = (uint)PGROUNDDOWN(srcva); +    pa0 = walkaddr(pagetable, va0); +    if(pa0 == 0) +      return -1; +    n = PGSIZE - (srcva - va0); +    if(n > max) +      n = max; + +    char *p = (char *) (pa0 + (srcva - va0)); +    while(n > 0){ +      if(*p == '\0'){ +        *dst = '\0'; +        got_null = 1; +        break; +      } else { +        *dst = *p; +      } +      --n; +      --max; +      p++; +      dst++; +    } + +    srcva = va0 + PGSIZE; +  } +  if(got_null){ +    return 0; +  } else { +    return -1; +  } +} | 
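For reference, a minimal sketch of how a system call in this tree combines argaddr() and copyout() to hand kernel data back to user space, in the style of sys_uptime() and sys_pipe() above. sys_getticks is a hypothetical example, not a call added by this diff, and it assumes the usual sysproc.c context (defs.h declares ticks, tickslock, and the helpers used here).

// hypothetical system call: copy the current tick count to a
// user-supplied pointer passed as the first syscall argument.
uint64
sys_getticks(void)
{
  uint64 uaddr;  // user virtual address of the destination
  uint xticks;

  if(argaddr(0, &uaddr) < 0)
    return -1;

  acquire(&tickslock);
  xticks = ticks;
  release(&tickslock);

  // copyout() translates uaddr through the calling process's page
  // table one page at a time, so uaddr need not be page aligned.
  if(copyout(myproc()->pagetable, uaddr, (char *)&xticks, sizeof(xticks)) < 0)
    return -1;
  return 0;
}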
