diff options
Diffstat (limited to 'kernel')
46 files changed, 6190 insertions, 0 deletions
diff --git a/kernel/bio.c b/kernel/bio.c new file mode 100644 index 0000000..a1074f2 --- /dev/null +++ b/kernel/bio.c @@ -0,0 +1,151 @@ +// Buffer cache. +// +// The buffer cache is a linked list of buf structures holding +// cached copies of disk block contents. Caching disk blocks +// in memory reduces the number of disk reads and also provides +// a synchronization point for disk blocks used by multiple processes. +// +// Interface: +// * To get a buffer for a particular disk block, call bread. +// * After changing buffer data, call bwrite to write it to disk. +// * When done with the buffer, call brelse. +// * Do not use the buffer after calling brelse. +// * Only one process at a time can use a buffer, +// so do not keep them longer than necessary. + + +#include "types.h" +#include "param.h" +#include "spinlock.h" +#include "sleeplock.h" +#include "riscv.h" +#include "defs.h" +#include "fs.h" +#include "buf.h" + +struct { + struct spinlock lock; + struct buf buf[NBUF]; + + // Linked list of all buffers, through prev/next. + // head.next is most recently used. + struct buf head; +} bcache; + +void +binit(void) +{ + struct buf *b; + + initlock(&bcache.lock, "bcache"); + + // Create linked list of buffers + bcache.head.prev = &bcache.head; + bcache.head.next = &bcache.head; + for(b = bcache.buf; b < bcache.buf+NBUF; b++){ + b->next = bcache.head.next; + b->prev = &bcache.head; + initsleeplock(&b->lock, "buffer"); + bcache.head.next->prev = b; + bcache.head.next = b; + } +} + +// Look through buffer cache for block on device dev. +// If not found, allocate a buffer. +// In either case, return locked buffer. +static struct buf* +bget(uint dev, uint blockno) +{ + struct buf *b; + + acquire(&bcache.lock); + + // Is the block already cached? 
+ for(b = bcache.head.next; b != &bcache.head; b = b->next){ + if(b->dev == dev && b->blockno == blockno){ + b->refcnt++; + release(&bcache.lock); + acquiresleep(&b->lock); + return b; + } + } + + // Not cached; recycle an unused buffer. + for(b = bcache.head.prev; b != &bcache.head; b = b->prev){ + if(b->refcnt == 0) { + b->dev = dev; + b->blockno = blockno; + b->valid = 0; + b->refcnt = 1; + release(&bcache.lock); + acquiresleep(&b->lock); + return b; + } + } + panic("bget: no buffers"); +} + +// Return a locked buf with the contents of the indicated block. +struct buf* +bread(uint dev, uint blockno) +{ + struct buf *b; + + b = bget(dev, blockno); + if(!b->valid) { + virtio_disk_rw(b, 0); + b->valid = 1; + } + return b; +} + +// Write b's contents to disk. Must be locked. +void +bwrite(struct buf *b) +{ + if(!holdingsleep(&b->lock)) + panic("bwrite"); + virtio_disk_rw(b, 1); +} + +// Release a locked buffer. +// Move to the head of the MRU list. +void +brelse(struct buf *b) +{ + if(!holdingsleep(&b->lock)) + panic("brelse"); + + releasesleep(&b->lock); + + acquire(&bcache.lock); + b->refcnt--; + if (b->refcnt == 0) { + // no one is waiting for it. + b->next->prev = b->prev; + b->prev->next = b->next; + b->next = bcache.head.next; + b->prev = &bcache.head; + bcache.head.next->prev = b; + bcache.head.next = b; + } + + release(&bcache.lock); +} + +void +bpin(struct buf *b) { + acquire(&bcache.lock); + b->refcnt++; + release(&bcache.lock); +} + +void +bunpin(struct buf *b) { + acquire(&bcache.lock); + b->refcnt--; + release(&bcache.lock); +} + + diff --git a/kernel/buf.h b/kernel/buf.h new file mode 100644 index 0000000..4a3a39d --- /dev/null +++ b/kernel/buf.h @@ -0,0 +1,13 @@ +struct buf { + int valid; // has data been read from disk? + int disk; // does disk "own" buf? 
+ uint dev; + uint blockno; + struct sleeplock lock; + uint refcnt; + struct buf *prev; // LRU cache list + struct buf *next; + struct buf *qnext; // disk queue + uchar data[BSIZE]; +}; + diff --git a/kernel/console.c b/kernel/console.c new file mode 100644 index 0000000..87a83ff --- /dev/null +++ b/kernel/console.c @@ -0,0 +1,199 @@ +// +// Console input and output, to the uart. +// Reads are line at a time. +// Implements special input characters: +// newline -- end of line +// control-h -- backspace +// control-u -- kill line +// control-d -- end of file +// control-p -- print process list +// + +#include <stdarg.h> + +#include "types.h" +#include "param.h" +#include "spinlock.h" +#include "sleeplock.h" +#include "fs.h" +#include "file.h" +#include "memlayout.h" +#include "riscv.h" +#include "defs.h" +#include "proc.h" + +#define BACKSPACE 0x100 +#define C(x) ((x)-'@') // Control-x + +// +// send one character to the uart. +// +void +consputc(int c) +{ + extern volatile int panicked; // from printf.c + + if(panicked){ + for(;;) + ; + } + + if(c == BACKSPACE){ + // if the user typed backspace, overwrite with a space. + uartputc('\b'); uartputc(' '); uartputc('\b'); + } else { + uartputc(c); + } +} + +struct { + struct spinlock lock; + + // input +#define INPUT_BUF 128 + char buf[INPUT_BUF]; + uint r; // Read index + uint w; // Write index + uint e; // Edit index +} cons; + +// +// user write()s to the console go here. +// +int +consolewrite(int user_src, uint64 src, int n) +{ + int i; + + acquire(&cons.lock); + for(i = 0; i < n; i++){ + char c; + if(either_copyin(&c, user_src, src+i, 1) == -1) + break; + consputc(c); + } + release(&cons.lock); + + return n; +} + +// +// user read()s from the console go here. +// copy (up to) a whole input line to dst. +// user_dist indicates whether dst is a user +// or kernel address. 
+// +int +consoleread(int user_dst, uint64 dst, int n) +{ + uint target; + int c; + char cbuf; + + target = n; + acquire(&cons.lock); + while(n > 0){ + // wait until interrupt handler has put some + // input into cons.buffer. + while(cons.r == cons.w){ + if(myproc()->killed){ + release(&cons.lock); + return -1; + } + sleep(&cons.r, &cons.lock); + } + + c = cons.buf[cons.r++ % INPUT_BUF]; + + if(c == C('D')){ // end-of-file + if(n < target){ + // Save ^D for next time, to make sure + // caller gets a 0-byte result. + cons.r--; + } + break; + } + + // copy the input byte to the user-space buffer. + cbuf = c; + if(either_copyout(user_dst, dst, &cbuf, 1) == -1) + break; + + dst++; + --n; + + if(c == '\n'){ + // a whole line has arrived, return to + // the user-level read(). + break; + } + } + release(&cons.lock); + + return target - n; +} + +// +// the console input interrupt handler. +// uartintr() calls this for input character. +// do erase/kill processing, append to cons.buf, +// wake up consoleread() if a whole line has arrived. +// +void +consoleintr(int c) +{ + acquire(&cons.lock); + + switch(c){ + case C('P'): // Print process list. + procdump(); + break; + case C('U'): // Kill line. + while(cons.e != cons.w && + cons.buf[(cons.e-1) % INPUT_BUF] != '\n'){ + cons.e--; + consputc(BACKSPACE); + } + break; + case C('H'): // Backspace + case '\x7f': + if(cons.e != cons.w){ + cons.e--; + consputc(BACKSPACE); + } + break; + default: + if(c != 0 && cons.e-cons.r < INPUT_BUF){ + c = (c == '\r') ? '\n' : c; + + // echo back to the user. + consputc(c); + + // store for consumption by consoleread(). + cons.buf[cons.e++ % INPUT_BUF] = c; + + if(c == '\n' || c == C('D') || cons.e == cons.r+INPUT_BUF){ + // wake up consoleread() if a whole line (or end-of-file) + // has arrived. 
+ cons.w = cons.e; + wakeup(&cons.r); + } + } + break; + } + + release(&cons.lock); +} + +void +consoleinit(void) +{ + initlock(&cons.lock, "cons"); + + uartinit(); + + // connect read and write system calls + // to consoleread and consolewrite. + devsw[CONSOLE].read = consoleread; + devsw[CONSOLE].write = consolewrite; +} diff --git a/kernel/date.h b/kernel/date.h new file mode 100644 index 0000000..94aec4b --- /dev/null +++ b/kernel/date.h @@ -0,0 +1,8 @@ +struct rtcdate { + uint second; + uint minute; + uint hour; + uint day; + uint month; + uint year; +}; diff --git a/kernel/defs.h b/kernel/defs.h new file mode 100644 index 0000000..23dcd41 --- /dev/null +++ b/kernel/defs.h @@ -0,0 +1,186 @@ +struct buf; +struct context; +struct file; +struct inode; +struct pipe; +struct proc; +struct spinlock; +struct sleeplock; +struct stat; +struct superblock; + +// bio.c +void binit(void); +struct buf* bread(uint, uint); +void brelse(struct buf*); +void bwrite(struct buf*); +void bpin(struct buf*); +void bunpin(struct buf*); + +// console.c +void consoleinit(void); +void consoleintr(int); +void consputc(int); + +// exec.c +int exec(char*, char**); + +// file.c +struct file* filealloc(void); +void fileclose(struct file*); +struct file* filedup(struct file*); +void fileinit(void); +int fileread(struct file*, uint64, int n); +int filestat(struct file*, uint64 addr); +int filewrite(struct file*, uint64, int n); + +// fs.c +void fsinit(int); +int dirlink(struct inode*, char*, uint); +struct inode* dirlookup(struct inode*, char*, uint*); +struct inode* ialloc(uint, short); +struct inode* idup(struct inode*); +void iinit(); +void ilock(struct inode*); +void iput(struct inode*); +void iunlock(struct inode*); +void iunlockput(struct inode*); +void iupdate(struct inode*); +int namecmp(const char*, const char*); +struct inode* namei(char*); +struct inode* nameiparent(char*, char*); +int readi(struct inode*, int, uint64, uint, uint); +void stati(struct inode*, struct stat*); +int 
writei(struct inode*, int, uint64, uint, uint); + +// ramdisk.c +void ramdiskinit(void); +void ramdiskintr(void); +void ramdiskrw(struct buf*); + +// kalloc.c +void* kalloc(void); +void kfree(void *); +void kinit(); + +// log.c +void initlog(int, struct superblock*); +void log_write(struct buf*); +void begin_op(); +void end_op(); + +// pipe.c +int pipealloc(struct file**, struct file**); +void pipeclose(struct pipe*, int); +int piperead(struct pipe*, uint64, int); +int pipewrite(struct pipe*, uint64, int); + +// printf.c +void printf(char*, ...); +void panic(char*) __attribute__((noreturn)); +void printfinit(void); + +// proc.c +int cpuid(void); +void exit(void); +int fork(void); +int growproc(int); +pagetable_t proc_pagetable(struct proc *); +void proc_freepagetable(pagetable_t, uint64); +int kill(int); +struct cpu* mycpu(void); +struct cpu* getmycpu(void); +struct proc* myproc(); +void procinit(void); +void scheduler(void) __attribute__((noreturn)); +void sched(void); +void setproc(struct proc*); +void sleep(void*, struct spinlock*); +void userinit(void); +int wait(void); +void wakeup(void*); +void yield(void); +int either_copyout(int user_dst, uint64 dst, void *src, uint64 len); +int either_copyin(void *dst, int user_src, uint64 src, uint64 len); +void procdump(void); + +// swtch.S +void swtch(struct context*, struct context*); + +// spinlock.c +void acquire(struct spinlock*); +int holding(struct spinlock*); +void initlock(struct spinlock*, char*); +void release(struct spinlock*); +void push_off(void); +void pop_off(void); + +// sleeplock.c +void acquiresleep(struct sleeplock*); +void releasesleep(struct sleeplock*); +int holdingsleep(struct sleeplock*); +void initsleeplock(struct sleeplock*, char*); + +// string.c +int memcmp(const void*, const void*, uint); +void* memmove(void*, const void*, uint); +void* memset(void*, int, uint); +char* safestrcpy(char*, const char*, int); +int strlen(const char*); +int strncmp(const char*, const char*, uint); +char* 
strncpy(char*, const char*, int); + +// syscall.c +int argint(int, int*); +int argstr(int, char*, int); +int argaddr(int, uint64 *); +int fetchstr(uint64, char*, int); +int fetchaddr(uint64, uint64*); +void syscall(); + +// trap.c +extern uint ticks; +void trapinit(void); +void trapinithart(void); +extern struct spinlock tickslock; +void usertrapret(void); + +// uart.c +void uartinit(void); +void uartintr(void); +void uartputc(int); +int uartgetc(void); + +// vm.c +void kvminit(void); +void kvminithart(void); +uint64 kvmpa(uint64); +void kvmmap(uint64, uint64, uint64, int); +int mappages(pagetable_t, uint64, uint64, uint64, int); +pagetable_t uvmcreate(void); +void uvminit(pagetable_t, uchar *, uint); +uint64 uvmalloc(pagetable_t, uint64, uint64); +uint64 uvmdealloc(pagetable_t, uint64, uint64); +int uvmcopy(pagetable_t, pagetable_t, uint64); +void uvmfree(pagetable_t, uint64); +void uvmunmap(pagetable_t, uint64, uint64, int); +void uvmclear(pagetable_t, uint64); +uint64 walkaddr(pagetable_t, uint64); +int copyout(pagetable_t, uint64, char *, uint64); +int copyin(pagetable_t, char *, uint64, uint64); +int copyinstr(pagetable_t, char *, uint64, uint64); + +// plic.c +void plicinit(void); +void plicinithart(void); +uint64 plic_pending(void); +int plic_claim(void); +void plic_complete(int); + +// virtio_disk.c +void virtio_disk_init(void); +void virtio_disk_rw(struct buf *, int); +void virtio_disk_intr(); + +// number of elements in fixed-size array +#define NELEM(x) (sizeof(x)/sizeof((x)[0])) diff --git a/kernel/elf.h b/kernel/elf.h new file mode 100644 index 0000000..84555fa --- /dev/null +++ b/kernel/elf.h @@ -0,0 +1,42 @@ +// Format of an ELF executable file + +#define ELF_MAGIC 0x464C457FU // "\x7FELF" in little endian + +// File header +struct elfhdr { + uint magic; // must equal ELF_MAGIC + uchar elf[12]; + ushort type; + ushort machine; + uint version; + uint64 entry; + uint64 phoff; + uint64 shoff; + uint flags; + ushort ehsize; + ushort phentsize; + ushort 
phnum; + ushort shentsize; + ushort shnum; + ushort shstrndx; +}; + +// Program section header +struct proghdr { + uint32 type; + uint32 flags; + uint64 off; + uint64 vaddr; + uint64 paddr; + uint64 filesz; + uint64 memsz; + uint64 align; +}; + +// Values for Proghdr type +#define ELF_PROG_LOAD 1 + +// Flag bits for Proghdr flags +#define ELF_PROG_FLAG_EXEC 1 +#define ELF_PROG_FLAG_WRITE 2 +#define ELF_PROG_FLAG_READ 4 diff --git a/kernel/entry.S b/kernel/entry.S new file mode 100644 index 0000000..ef5a56a --- /dev/null +++ b/kernel/entry.S @@ -0,0 +1,26 @@ + # qemu -kernel starts at 0x1000. the instructions + # there seem to be provided by qemu, as if it + # were a ROM. the code at 0x1000 jumps to + # 0x8000000, the _start function here, + # in machine mode. each CPU starts here. +.section .data +.globl stack0 +.section .text +.globl start +.section .text +.globl _entry +_entry: + # set up a stack for C. + # stack0 is declared in start.c, + # with a 4096-byte stack per CPU. + # sp = stack0 + (hartid * 4096) + la sp, stack0 + li a0, 1024*4 + csrr a1, mhartid + addi a1, a1, 1 + mul a0, a0, a1 + add sp, sp, a0 + # jump to start() in start.c + call start +junk: + j junk diff --git a/kernel/exec.c b/kernel/exec.c new file mode 100644 index 0000000..74ef654 --- /dev/null +++ b/kernel/exec.c @@ -0,0 +1,153 @@ +#include "types.h" +#include "param.h" +#include "memlayout.h" +#include "riscv.h" +#include "spinlock.h" +#include "proc.h" +#include "defs.h" +#include "elf.h" + +static int loadseg(pde_t *pgdir, uint64 addr, struct inode *ip, uint offset, uint sz); + +int +exec(char *path, char **argv) +{ + char *s, *last; + int i, off; + uint64 argc, sz, sp, ustack[MAXARG+1], stackbase; + struct elfhdr elf; + struct inode *ip; + struct proghdr ph; + pagetable_t pagetable = 0, oldpagetable; + struct proc *p = myproc(); + + begin_op(); + + if((ip = namei(path)) == 0){ + end_op(); + return -1; + } + ilock(ip); + + // Check ELF header + if(readi(ip, 0, (uint64)&elf, 0, sizeof(elf)) 
!= sizeof(elf)) + goto bad; + if(elf.magic != ELF_MAGIC) + goto bad; + + if((pagetable = proc_pagetable(p)) == 0) + goto bad; + + // Load program into memory. + sz = 0; + for(i=0, off=elf.phoff; i<elf.phnum; i++, off+=sizeof(ph)){ + if(readi(ip, 0, (uint64)&ph, off, sizeof(ph)) != sizeof(ph)) + goto bad; + if(ph.type != ELF_PROG_LOAD) + continue; + if(ph.memsz < ph.filesz) + goto bad; + if(ph.vaddr + ph.memsz < ph.vaddr) + goto bad; + if((sz = uvmalloc(pagetable, sz, ph.vaddr + ph.memsz)) == 0) + goto bad; + if(ph.vaddr % PGSIZE != 0) + goto bad; + if(loadseg(pagetable, ph.vaddr, ip, ph.off, ph.filesz) < 0) + goto bad; + } + iunlockput(ip); + end_op(); + ip = 0; + + p = myproc(); + uint64 oldsz = p->sz; + + // Allocate two pages at the next page boundary. + // Use the second as the user stack. + sz = PGROUNDUP(sz); + if((sz = uvmalloc(pagetable, sz, sz + 2*PGSIZE)) == 0) + goto bad; + uvmclear(pagetable, sz-2*PGSIZE); + sp = sz; + stackbase = sp - PGSIZE; + + // Push argument strings, prepare rest of stack in ustack. + for(argc = 0; argv[argc]; argc++) { + if(argc >= MAXARG) + goto bad; + sp -= strlen(argv[argc]) + 1; + sp -= sp % 16; // riscv sp must be 16-byte aligned + if(sp < stackbase) + goto bad; + if(copyout(pagetable, sp, argv[argc], strlen(argv[argc]) + 1) < 0) + goto bad; + ustack[argc] = sp; + } + ustack[argc] = 0; + + // push the array of argv[] pointers. + sp -= (argc+1) * sizeof(uint64); + sp -= sp % 16; + if(sp < stackbase) + goto bad; + if(copyout(pagetable, sp, (char *)ustack, (argc+1)*sizeof(uint64)) < 0) + goto bad; + + // arguments to user main(argc, argv) + // argc is returned via the system call return + // value, which goes in a0. + p->tf->a1 = sp; + + // Save program name for debugging. + for(last=s=path; *s; s++) + if(*s == '/') + last = s+1; + safestrcpy(p->name, last, sizeof(p->name)); + + // Commit to the user image. 
+ oldpagetable = p->pagetable; + p->pagetable = pagetable; + p->sz = sz; + p->tf->epc = elf.entry; // initial program counter = main + p->tf->sp = sp; // initial stack pointer + proc_freepagetable(oldpagetable, oldsz); + return argc; // this ends up in a0, the first argument to main(argc, argv) + + bad: + if(pagetable) + proc_freepagetable(pagetable, sz); + if(ip){ + iunlockput(ip); + end_op(); + } + return -1; +} + +// Load a program segment into pagetable at virtual address va. +// va must be page-aligned +// and the pages from va to va+sz must already be mapped. +// Returns 0 on success, -1 on failure. +static int +loadseg(pagetable_t pagetable, uint64 va, struct inode *ip, uint offset, uint sz) +{ + uint i, n; + uint64 pa; + + if((va % PGSIZE) != 0) + panic("loadseg: va must be page aligned"); + + for(i = 0; i < sz; i += PGSIZE){ + pa = walkaddr(pagetable, va + i); + if(pa == 0) + panic("loadseg: address should exist"); + if(sz - i < PGSIZE) + n = sz - i; + else + n = PGSIZE; + if(readi(ip, 0, (uint64)pa, offset+i, n) != n) + return -1; + } + + return 0; +} diff --git a/kernel/fcntl.h b/kernel/fcntl.h new file mode 100644 index 0000000..d565483 --- /dev/null +++ b/kernel/fcntl.h @@ -0,0 +1,4 @@ +#define O_RDONLY 0x000 +#define O_WRONLY 0x001 +#define O_RDWR 0x002 +#define O_CREATE 0x200 diff --git a/kernel/file.c b/kernel/file.c new file mode 100644 index 0000000..7663476 --- /dev/null +++ b/kernel/file.c @@ -0,0 +1,178 @@ +// +// Support functions for system calls that involve file descriptors. +// + +#include "types.h" +#include "riscv.h" +#include "defs.h" +#include "param.h" +#include "fs.h" +#include "spinlock.h" +#include "sleeplock.h" +#include "file.h" +#include "stat.h" +#include "proc.h" + +struct devsw devsw[NDEV]; +struct { + struct spinlock lock; + struct file file[NFILE]; +} ftable; + +void +fileinit(void) +{ + initlock(&ftable.lock, "ftable"); +} + +// Allocate a file structure. 
+struct file* +filealloc(void) +{ + struct file *f; + + acquire(&ftable.lock); + for(f = ftable.file; f < ftable.file + NFILE; f++){ + if(f->ref == 0){ + f->ref = 1; + release(&ftable.lock); + return f; + } + } + release(&ftable.lock); + return 0; +} + +// Increment ref count for file f. +struct file* +filedup(struct file *f) +{ + acquire(&ftable.lock); + if(f->ref < 1) + panic("filedup"); + f->ref++; + release(&ftable.lock); + return f; +} + +// Close file f. (Decrement ref count, close when reaches 0.) +void +fileclose(struct file *f) +{ + struct file ff; + + acquire(&ftable.lock); + if(f->ref < 1) + panic("fileclose"); + if(--f->ref > 0){ + release(&ftable.lock); + return; + } + ff = *f; + f->ref = 0; + f->type = FD_NONE; + release(&ftable.lock); + + if(ff.type == FD_PIPE){ + pipeclose(ff.pipe, ff.writable); + } else if(ff.type == FD_INODE || ff.type == FD_DEVICE){ + begin_op(); + iput(ff.ip); + end_op(); + } +} + +// Get metadata about file f. +// addr is a user virtual address, pointing to a struct stat. +int +filestat(struct file *f, uint64 addr) +{ + struct proc *p = myproc(); + struct stat st; + + if(f->type == FD_INODE || f->type == FD_DEVICE){ + ilock(f->ip); + stati(f->ip, &st); + iunlock(f->ip); + if(copyout(p->pagetable, addr, (char *)&st, sizeof(st)) < 0) + return -1; + return 0; + } + return -1; +} + +// Read from file f. +// addr is a user virtual address. +int +fileread(struct file *f, uint64 addr, int n) +{ + int r = 0; + + if(f->readable == 0) + return -1; + + if(f->type == FD_PIPE){ + r = piperead(f->pipe, addr, n); + } else if(f->type == FD_DEVICE){ + r = devsw[f->major].read(1, addr, n); + } else if(f->type == FD_INODE){ + ilock(f->ip); + if((r = readi(f->ip, 1, addr, f->off, n)) > 0) + f->off += r; + iunlock(f->ip); + } else { + panic("fileread"); + } + + return r; +} + +// Write to file f. +// addr is a user virtual address. 
+int +filewrite(struct file *f, uint64 addr, int n) +{ + int r, ret = 0; + + if(f->writable == 0) + return -1; + + if(f->type == FD_PIPE){ + ret = pipewrite(f->pipe, addr, n); + } else if(f->type == FD_DEVICE){ + ret = devsw[f->major].write(1, addr, n); + } else if(f->type == FD_INODE){ + // write a few blocks at a time to avoid exceeding + // the maximum log transaction size, including + // i-node, indirect block, allocation blocks, + // and 2 blocks of slop for non-aligned writes. + // this really belongs lower down, since writei() + // might be writing a device like the console. + int max = ((MAXOPBLOCKS-1-1-2) / 2) * 512; + int i = 0; + while(i < n){ + int n1 = n - i; + if(n1 > max) + n1 = max; + + begin_op(); + ilock(f->ip); + if ((r = writei(f->ip, 1, addr + i, f->off, n1)) > 0) + f->off += r; + iunlock(f->ip); + end_op(); + + if(r < 0) + break; + if(r != n1) + panic("short filewrite"); + i += r; + } + ret = (i == n ? n : -1); + } else { + panic("filewrite"); + } + + return ret; +} + diff --git a/kernel/file.h b/kernel/file.h new file mode 100644 index 0000000..5cf15a2 --- /dev/null +++ b/kernel/file.h @@ -0,0 +1,37 @@ +struct file { + enum { FD_NONE, FD_PIPE, FD_INODE, FD_DEVICE } type; + int ref; // reference count + char readable; + char writable; + struct pipe *pipe; // FD_PIPE + struct inode *ip; // FD_INODE and FD_DEVICE + uint off; // FD_INODE + short major; // FD_DEVICE +}; + + +// in-memory copy of an inode +struct inode { + uint dev; // Device number + uint inum; // Inode number + int ref; // Reference count + struct sleeplock lock; // protects everything below here + int valid; // inode has been read from disk? + + short type; // copy of disk inode + short major; + short minor; + short nlink; + uint size; + uint addrs[NDIRECT+1]; +}; + +// map major device number to device functions. 
+struct devsw { + int (*read)(int, uint64, int); + int (*write)(int, uint64, int); +}; + +extern struct devsw devsw[]; + +#define CONSOLE 1 diff --git a/kernel/fs.c b/kernel/fs.c new file mode 100644 index 0000000..454d52b --- /dev/null +++ b/kernel/fs.c @@ -0,0 +1,666 @@ +// File system implementation. Five layers: +// + Blocks: allocator for raw disk blocks. +// + Log: crash recovery for multi-step updates. +// + Files: inode allocator, reading, writing, metadata. +// + Directories: inode with special contents (list of other inodes!) +// + Names: paths like /usr/rtm/xv6/fs.c for convenient naming. +// +// This file contains the low-level file system manipulation +// routines. The (higher-level) system call implementations +// are in sysfile.c. + +#include "types.h" +#include "riscv.h" +#include "defs.h" +#include "param.h" +#include "stat.h" +#include "spinlock.h" +#include "proc.h" +#include "sleeplock.h" +#include "fs.h" +#include "buf.h" +#include "file.h" + +#define min(a, b) ((a) < (b) ? (a) : (b)) +static void itrunc(struct inode*); +// there should be one superblock per disk device, but we run with +// only one device +struct superblock sb; + +// Read the super block. +static void +readsb(int dev, struct superblock *sb) +{ + struct buf *bp; + + bp = bread(dev, 1); + memmove(sb, bp->data, sizeof(*sb)); + brelse(bp); +} + +// Init fs +void +fsinit(int dev) { + readsb(dev, &sb); + if(sb.magic != FSMAGIC) + panic("invalid file system"); + initlog(dev, &sb); +} + +// Zero a block. +static void +bzero(int dev, int bno) +{ + struct buf *bp; + + bp = bread(dev, bno); + memset(bp->data, 0, BSIZE); + log_write(bp); + brelse(bp); +} + +// Blocks. + +// Allocate a zeroed disk block. +static uint +balloc(uint dev) +{ + int b, bi, m; + struct buf *bp; + + bp = 0; + for(b = 0; b < sb.size; b += BPB){ + bp = bread(dev, BBLOCK(b, sb)); + for(bi = 0; bi < BPB && b + bi < sb.size; bi++){ + m = 1 << (bi % 8); + if((bp->data[bi/8] & m) == 0){ // Is block free? 
+ bp->data[bi/8] |= m; // Mark block in use. + log_write(bp); + brelse(bp); + bzero(dev, b + bi); + return b + bi; + } + } + brelse(bp); + } + panic("balloc: out of blocks"); +} + +// Free a disk block. +static void +bfree(int dev, uint b) +{ + struct buf *bp; + int bi, m; + + bp = bread(dev, BBLOCK(b, sb)); + bi = b % BPB; + m = 1 << (bi % 8); + if((bp->data[bi/8] & m) == 0) + panic("freeing free block"); + bp->data[bi/8] &= ~m; + log_write(bp); + brelse(bp); +} + +// Inodes. +// +// An inode describes a single unnamed file. +// The inode disk structure holds metadata: the file's type, +// its size, the number of links referring to it, and the +// list of blocks holding the file's content. +// +// The inodes are laid out sequentially on disk at +// sb.startinode. Each inode has a number, indicating its +// position on the disk. +// +// The kernel keeps a cache of in-use inodes in memory +// to provide a place for synchronizing access +// to inodes used by multiple processes. The cached +// inodes include book-keeping information that is +// not stored on disk: ip->ref and ip->valid. +// +// An inode and its in-memory representation go through a +// sequence of states before they can be used by the +// rest of the file system code. +// +// * Allocation: an inode is allocated if its type (on disk) +// is non-zero. ialloc() allocates, and iput() frees if +// the reference and link counts have fallen to zero. +// +// * Referencing in cache: an entry in the inode cache +// is free if ip->ref is zero. Otherwise ip->ref tracks +// the number of in-memory pointers to the entry (open +// files and current directories). iget() finds or +// creates a cache entry and increments its ref; iput() +// decrements ref. +// +// * Valid: the information (type, size, &c) in an inode +// cache entry is only correct when ip->valid is 1. +// ilock() reads the inode from +// the disk and sets ip->valid, while iput() clears +// ip->valid if ip->ref has fallen to zero. 
+// +// * Locked: file system code may only examine and modify +// the information in an inode and its content if it +// has first locked the inode. +// +// Thus a typical sequence is: +// ip = iget(dev, inum) +// ilock(ip) +// ... examine and modify ip->xxx ... +// iunlock(ip) +// iput(ip) +// +// ilock() is separate from iget() so that system calls can +// get a long-term reference to an inode (as for an open file) +// and only lock it for short periods (e.g., in read()). +// The separation also helps avoid deadlock and races during +// pathname lookup. iget() increments ip->ref so that the inode +// stays cached and pointers to it remain valid. +// +// Many internal file system functions expect the caller to +// have locked the inodes involved; this lets callers create +// multi-step atomic operations. +// +// The icache.lock spin-lock protects the allocation of icache +// entries. Since ip->ref indicates whether an entry is free, +// and ip->dev and ip->inum indicate which i-node an entry +// holds, one must hold icache.lock while using any of those fields. +// +// An ip->lock sleep-lock protects all ip-> fields other than ref, +// dev, and inum. One must hold ip->lock in order to +// read or write that inode's ip->valid, ip->size, ip->type, &c. + +struct { + struct spinlock lock; + struct inode inode[NINODE]; +} icache; + +void +iinit() +{ + int i = 0; + + initlock(&icache.lock, "icache"); + for(i = 0; i < NINODE; i++) { + initsleeplock(&icache.inode[i].lock, "inode"); + } +} + +static struct inode* iget(uint dev, uint inum); + +// Allocate an inode on device dev. +// Mark it as allocated by giving it type type. +// Returns an unlocked but allocated and referenced inode. 
+struct inode* +ialloc(uint dev, short type) +{ + int inum; + struct buf *bp; + struct dinode *dip; + + for(inum = 1; inum < sb.ninodes; inum++){ + bp = bread(dev, IBLOCK(inum, sb)); + dip = (struct dinode*)bp->data + inum%IPB; + if(dip->type == 0){ // a free inode + memset(dip, 0, sizeof(*dip)); + dip->type = type; + log_write(bp); // mark it allocated on the disk + brelse(bp); + return iget(dev, inum); + } + brelse(bp); + } + panic("ialloc: no inodes"); +} + +// Copy a modified in-memory inode to disk. +// Must be called after every change to an ip->xxx field +// that lives on disk, since i-node cache is write-through. +// Caller must hold ip->lock. +void +iupdate(struct inode *ip) +{ + struct buf *bp; + struct dinode *dip; + + bp = bread(ip->dev, IBLOCK(ip->inum, sb)); + dip = (struct dinode*)bp->data + ip->inum%IPB; + dip->type = ip->type; + dip->major = ip->major; + dip->minor = ip->minor; + dip->nlink = ip->nlink; + dip->size = ip->size; + memmove(dip->addrs, ip->addrs, sizeof(ip->addrs)); + log_write(bp); + brelse(bp); +} + +// Find the inode with number inum on device dev +// and return the in-memory copy. Does not lock +// the inode and does not read it from disk. +static struct inode* +iget(uint dev, uint inum) +{ + struct inode *ip, *empty; + + acquire(&icache.lock); + + // Is the inode already cached? + empty = 0; + for(ip = &icache.inode[0]; ip < &icache.inode[NINODE]; ip++){ + if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){ + ip->ref++; + release(&icache.lock); + return ip; + } + if(empty == 0 && ip->ref == 0) // Remember empty slot. + empty = ip; + } + + // Recycle an inode cache entry. + if(empty == 0) + panic("iget: no inodes"); + + ip = empty; + ip->dev = dev; + ip->inum = inum; + ip->ref = 1; + ip->valid = 0; + release(&icache.lock); + + return ip; +} + +// Increment reference count for ip. +// Returns ip to enable ip = idup(ip1) idiom. 
struct inode*
idup(struct inode *ip)
{
  // Bump the cache reference count under icache.lock; the entry
  // cannot be recycled by iget() while ref > 0.
  acquire(&icache.lock);
  ip->ref++;
  release(&icache.lock);
  return ip;
}

// Lock the given inode.
// Reads the inode from disk if necessary: iget() fills the
// in-memory copy lazily, so the first ilock() after a cache
// miss loads the on-disk dinode here.
void
ilock(struct inode *ip)
{
  struct buf *bp;
  struct dinode *dip;

  if(ip == 0 || ip->ref < 1)
    panic("ilock");

  acquiresleep(&ip->lock);

  if(ip->valid == 0){
    // Copy the on-disk dinode fields into the in-memory inode.
    bp = bread(ip->dev, IBLOCK(ip->inum, sb));
    dip = (struct dinode*)bp->data + ip->inum%IPB;
    ip->type = dip->type;
    ip->major = dip->major;
    ip->minor = dip->minor;
    ip->nlink = dip->nlink;
    ip->size = dip->size;
    memmove(ip->addrs, dip->addrs, sizeof(ip->addrs));
    brelse(bp);
    ip->valid = 1;
    // A cached inode must refer to an allocated on-disk inode;
    // type 0 here means the caller holds a stale reference.
    if(ip->type == 0)
      panic("ilock: no type");
  }
}

// Unlock the given inode. Caller must hold ip->lock.
void
iunlock(struct inode *ip)
{
  if(ip == 0 || !holdingsleep(&ip->lock) || ip->ref < 1)
    panic("iunlock");

  releasesleep(&ip->lock);
}

// Drop a reference to an in-memory inode.
// If that was the last reference, the inode cache entry can
// be recycled.
// If that was the last reference and the inode has no links
// to it, free the inode (and its content) on disk.
// All calls to iput() must be inside a transaction in
// case it has to free the inode (itrunc/iupdate call log_write).
void
iput(struct inode *ip)
{
  acquire(&icache.lock);

  if(ip->ref == 1 && ip->valid && ip->nlink == 0){
    // inode has no links and no other references: truncate and free.

    // ip->ref == 1 means no other process can have ip locked,
    // so this acquiresleep() won't block (or deadlock).
    acquiresleep(&ip->lock);

    // Drop the spinlock while doing disk I/O below; bread()
    // may sleep, which is illegal while holding icache.lock.
    release(&icache.lock);

    itrunc(ip);
    ip->type = 0;
    iupdate(ip);
    ip->valid = 0;

    releasesleep(&ip->lock);

    // Reacquire to decrement ref below.
    acquire(&icache.lock);
  }

  ip->ref--;
  release(&icache.lock);
}

// Common idiom: unlock, then put.
void
iunlockput(struct inode *ip)
{
  iunlock(ip);
  iput(ip);
}

// Inode content
//
// The content (data) associated with each inode is stored
// in blocks on the disk. The first NDIRECT block numbers
// are listed in ip->addrs[]. The next NINDIRECT blocks are
// listed in block ip->addrs[NDIRECT].

// Return the disk block address of the nth block in inode ip.
// If there is no such block, bmap allocates one.
// Caller must hold ip->lock (bmap reads and writes ip->addrs[])
// and be inside a transaction (it may call balloc/log_write).
static uint
bmap(struct inode *ip, uint bn)
{
  uint addr, *a;
  struct buf *bp;

  if(bn < NDIRECT){
    // Direct block: allocate on first use.
    if((addr = ip->addrs[bn]) == 0)
      ip->addrs[bn] = addr = balloc(ip->dev);
    return addr;
  }
  bn -= NDIRECT;

  if(bn < NINDIRECT){
    // Load indirect block, allocating if necessary.
    // NOTE(review): balloc()'s result is used unchecked here;
    // presumably it panics rather than returning 0 -- confirm.
    if((addr = ip->addrs[NDIRECT]) == 0)
      ip->addrs[NDIRECT] = addr = balloc(ip->dev);
    bp = bread(ip->dev, addr);
    a = (uint*)bp->data;
    if((addr = a[bn]) == 0){
      a[bn] = addr = balloc(ip->dev);
      log_write(bp);  // record the new mapping in the log
    }
    brelse(bp);
    return addr;
  }

  panic("bmap: out of range");
}

// Truncate inode (discard contents).
// Only called when the inode has no links
// to it (no directory entries referring to it)
// and has no in-memory reference to it (is
// not an open file or current directory).
static void
itrunc(struct inode *ip)
{
  int i, j;
  struct buf *bp;
  uint *a;

  // Free the direct blocks.
  for(i = 0; i < NDIRECT; i++){
    if(ip->addrs[i]){
      bfree(ip->dev, ip->addrs[i]);
      ip->addrs[i] = 0;
    }
  }

  // Free every data block listed in the indirect block,
  // then the indirect block itself.
  if(ip->addrs[NDIRECT]){
    bp = bread(ip->dev, ip->addrs[NDIRECT]);
    a = (uint*)bp->data;
    for(j = 0; j < NINDIRECT; j++){
      if(a[j])
        bfree(ip->dev, a[j]);
    }
    brelse(bp);
    bfree(ip->dev, ip->addrs[NDIRECT]);
    ip->addrs[NDIRECT] = 0;
  }

  ip->size = 0;
  iupdate(ip);  // persist the zeroed size and addrs
}

// Copy stat information from inode.
// Caller must hold ip->lock.
void
stati(struct inode *ip, struct stat *st)
{
  st->dev = ip->dev;
  st->ino = ip->inum;
  st->type = ip->type;
  st->nlink = ip->nlink;
  st->size = ip->size;
}

// Read data from inode.
// Caller must hold ip->lock.
// If user_dst==1, then dst is a user virtual address;
// otherwise, dst is a kernel address.
+int +readi(struct inode *ip, int user_dst, uint64 dst, uint off, uint n) +{ + uint tot, m; + struct buf *bp; + + if(off > ip->size || off + n < off) + return -1; + if(off + n > ip->size) + n = ip->size - off; + + for(tot=0; tot<n; tot+=m, off+=m, dst+=m){ + bp = bread(ip->dev, bmap(ip, off/BSIZE)); + m = min(n - tot, BSIZE - off%BSIZE); + if(either_copyout(user_dst, dst, bp->data + (off % BSIZE), m) == -1) + break; + brelse(bp); + } + return n; +} + +// Write data to inode. +// Caller must hold ip->lock. +// If user_src==1, then src is a user virtual address; +// otherwise, src is a kernel address. +int +writei(struct inode *ip, int user_src, uint64 src, uint off, uint n) +{ + uint tot, m; + struct buf *bp; + + if(off > ip->size || off + n < off) + return -1; + if(off + n > MAXFILE*BSIZE) + return -1; + + for(tot=0; tot<n; tot+=m, off+=m, src+=m){ + bp = bread(ip->dev, bmap(ip, off/BSIZE)); + m = min(n - tot, BSIZE - off%BSIZE); + if(either_copyin(bp->data + (off % BSIZE), user_src, src, m) == -1) + break; + log_write(bp); + brelse(bp); + } + + if(n > 0 && off > ip->size){ + ip->size = off; + iupdate(ip); + } + return n; +} + +// Directories + +int +namecmp(const char *s, const char *t) +{ + return strncmp(s, t, DIRSIZ); +} + +// Look for a directory entry in a directory. +// If found, set *poff to byte offset of entry. +struct inode* +dirlookup(struct inode *dp, char *name, uint *poff) +{ + uint off, inum; + struct dirent de; + + if(dp->type != T_DIR) + panic("dirlookup not DIR"); + + for(off = 0; off < dp->size; off += sizeof(de)){ + if(readi(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de)) + panic("dirlookup read"); + if(de.inum == 0) + continue; + if(namecmp(name, de.name) == 0){ + // entry matches path element + if(poff) + *poff = off; + inum = de.inum; + return iget(dp->dev, inum); + } + } + + return 0; +} + +// Write a new directory entry (name, inum) into the directory dp. 
+int +dirlink(struct inode *dp, char *name, uint inum) +{ + int off; + struct dirent de; + struct inode *ip; + + // Check that name is not present. + if((ip = dirlookup(dp, name, 0)) != 0){ + iput(ip); + return -1; + } + + // Look for an empty dirent. + for(off = 0; off < dp->size; off += sizeof(de)){ + if(readi(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de)) + panic("dirlink read"); + if(de.inum == 0) + break; + } + + strncpy(de.name, name, DIRSIZ); + de.inum = inum; + if(writei(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de)) + panic("dirlink"); + + return 0; +} + +// Paths + +// Copy the next path element from path into name. +// Return a pointer to the element following the copied one. +// The returned path has no leading slashes, +// so the caller can check *path=='\0' to see if the name is the last one. +// If no name to remove, return 0. +// +// Examples: +// skipelem("a/bb/c", name) = "bb/c", setting name = "a" +// skipelem("///a//bb", name) = "bb", setting name = "a" +// skipelem("a", name) = "", setting name = "a" +// skipelem("", name) = skipelem("////", name) = 0 +// +static char* +skipelem(char *path, char *name) +{ + char *s; + int len; + + while(*path == '/') + path++; + if(*path == 0) + return 0; + s = path; + while(*path != '/' && *path != 0) + path++; + len = path - s; + if(len >= DIRSIZ) + memmove(name, s, DIRSIZ); + else { + memmove(name, s, len); + name[len] = 0; + } + while(*path == '/') + path++; + return path; +} + +// Look up and return the inode for a path name. +// If parent != 0, return the inode for the parent and copy the final +// path element into name, which must have room for DIRSIZ bytes. +// Must be called inside a transaction since it calls iput(). 
+static struct inode* +namex(char *path, int nameiparent, char *name) +{ + struct inode *ip, *next; + + if(*path == '/') + ip = iget(ROOTDEV, ROOTINO); + else + ip = idup(myproc()->cwd); + + while((path = skipelem(path, name)) != 0){ + ilock(ip); + if(ip->type != T_DIR){ + iunlockput(ip); + return 0; + } + if(nameiparent && *path == '\0'){ + // Stop one level early. + iunlock(ip); + return ip; + } + if((next = dirlookup(ip, name, 0)) == 0){ + iunlockput(ip); + return 0; + } + iunlockput(ip); + ip = next; + } + if(nameiparent){ + iput(ip); + return 0; + } + return ip; +} + +struct inode* +namei(char *path) +{ + char name[DIRSIZ]; + return namex(path, 0, name); +} + +struct inode* +nameiparent(char *path, char *name) +{ + return namex(path, 1, name); +} diff --git a/kernel/fs.h b/kernel/fs.h new file mode 100644 index 0000000..139dcc9 --- /dev/null +++ b/kernel/fs.h @@ -0,0 +1,60 @@ +// On-disk file system format. +// Both the kernel and user programs use this header file. + + +#define ROOTINO 1 // root i-number +#define BSIZE 1024 // block size + +// Disk layout: +// [ boot block | super block | log | inode blocks | +// free bit map | data blocks] +// +// mkfs computes the super block and builds an initial file system. The +// super block describes the disk layout: +struct superblock { + uint magic; // Must be FSMAGIC + uint size; // Size of file system image (blocks) + uint nblocks; // Number of data blocks + uint ninodes; // Number of inodes. 
+ uint nlog; // Number of log blocks + uint logstart; // Block number of first log block + uint inodestart; // Block number of first inode block + uint bmapstart; // Block number of first free map block +}; + +#define FSMAGIC 0x10203040 + +#define NDIRECT 12 +#define NINDIRECT (BSIZE / sizeof(uint)) +#define MAXFILE (NDIRECT + NINDIRECT) + +// On-disk inode structure +struct dinode { + short type; // File type + short major; // Major device number (T_DEVICE only) + short minor; // Minor device number (T_DEVICE only) + short nlink; // Number of links to inode in file system + uint size; // Size of file (bytes) + uint addrs[NDIRECT+1]; // Data block addresses +}; + +// Inodes per block. +#define IPB (BSIZE / sizeof(struct dinode)) + +// Block containing inode i +#define IBLOCK(i, sb) ((i) / IPB + sb.inodestart) + +// Bitmap bits per block +#define BPB (BSIZE*8) + +// Block of free map containing bit for block b +#define BBLOCK(b, sb) ((b)/BPB + sb.bmapstart) + +// Directory is a file containing a sequence of dirent structures. +#define DIRSIZ 14 + +struct dirent { + ushort inum; + char name[DIRSIZ]; +}; + diff --git a/kernel/kalloc.c b/kernel/kalloc.c new file mode 100644 index 0000000..ae3863b --- /dev/null +++ b/kernel/kalloc.c @@ -0,0 +1,83 @@ +// Physical memory allocator, for user processes, +// kernel stacks, page-table pages, +// and pipe buffers. Allocates whole 4096-byte pages. + +#include "types.h" +#include "param.h" +#include "memlayout.h" +#include "spinlock.h" +#include "riscv.h" +#include "defs.h" + +void freerange(void *pa_start, void *pa_end); + +extern char end[]; // first address after kernel. + // defined by kernel.ld. 
+ +struct run { + struct run *next; +}; + +struct { + struct spinlock lock; + struct run *freelist; +} kmem; + +void +kinit() +{ + initlock(&kmem.lock, "kmem"); + freerange(end, (void*)PHYSTOP); +} + +void +freerange(void *pa_start, void *pa_end) +{ + char *p; + p = (char*)PGROUNDUP((uint64)pa_start); + p += 4096; // XXX I can't get kernel.ld to place end beyond the last bss symbol. + for(; p + PGSIZE <= (char*)pa_end; p += PGSIZE) + kfree(p); +} + +// Free the page of physical memory pointed at by v, +// which normally should have been returned by a +// call to kalloc(). (The exception is when +// initializing the allocator; see kinit above.) +void +kfree(void *pa) +{ + struct run *r; + + if(((uint64)pa % PGSIZE) != 0 || (char*)pa < end || (uint64)pa >= PHYSTOP) + panic("kfree"); + + // Fill with junk to catch dangling refs. + memset(pa, 1, PGSIZE); + + r = (struct run*)pa; + + acquire(&kmem.lock); + r->next = kmem.freelist; + kmem.freelist = r; + release(&kmem.lock); +} + +// Allocate one 4096-byte page of physical memory. +// Returns a pointer that the kernel can use. +// Returns 0 if the memory cannot be allocated. +void * +kalloc(void) +{ + struct run *r; + + acquire(&kmem.lock); + r = kmem.freelist; + if(r) + kmem.freelist = r->next; + release(&kmem.lock); + + if(r) + memset((char*)r, 5, PGSIZE); // fill with junk + return (void*)r; +} diff --git a/kernel/kernel.ld b/kernel/kernel.ld new file mode 100644 index 0000000..0b5e76b --- /dev/null +++ b/kernel/kernel.ld @@ -0,0 +1,32 @@ +OUTPUT_ARCH( "riscv" ) +ENTRY( _entry ) + +SECTIONS +{ + /* + * ensure that entry.S / _entry is at 0x80000000, + * where qemu's -kernel jumps. + */ + . = 0x80000000; + .text : + { + *(.text) + . = ALIGN(0x1000); + *(trampsec) + } + + . = ALIGN(0x1000); + PROVIDE(etext = .); + + /* + * make sure end is after data and bss. 
+ */ + .data : { + *(.data) + } + bss : { + *(.bss) + PROVIDE(end = .); + } + +} diff --git a/kernel/kernelvec.S b/kernel/kernelvec.S new file mode 100644 index 0000000..3e9d3e9 --- /dev/null +++ b/kernel/kernelvec.S @@ -0,0 +1,121 @@ + # + # interrupts and exceptions while in supervisor + # mode come here. + # + # push all registers, call kerneltrap(), restore, return. + # +.globl kerneltrap +.globl kernelvec +.align 4 +kernelvec: + // make room to save registers. + addi sp, sp, -256 + + // save the registers. + sd ra, 0(sp) + sd sp, 8(sp) + sd gp, 16(sp) + sd tp, 24(sp) + sd t0, 32(sp) + sd t1, 40(sp) + sd t2, 48(sp) + sd s0, 56(sp) + sd s1, 64(sp) + sd a0, 72(sp) + sd a1, 80(sp) + sd a2, 88(sp) + sd a3, 96(sp) + sd a4, 104(sp) + sd a5, 112(sp) + sd a6, 120(sp) + sd a7, 128(sp) + sd s2, 136(sp) + sd s3, 144(sp) + sd s4, 152(sp) + sd s5, 160(sp) + sd s6, 168(sp) + sd s7, 176(sp) + sd s8, 184(sp) + sd s9, 192(sp) + sd s10, 200(sp) + sd s11, 208(sp) + sd t3, 216(sp) + sd t4, 224(sp) + sd t5, 232(sp) + sd t6, 240(sp) + + // call the C trap handler in trap.c + call kerneltrap + + // restore registers. + ld ra, 0(sp) + ld sp, 8(sp) + ld gp, 16(sp) + // not this, in case we moved CPUs: ld tp, 24(sp) + ld t0, 32(sp) + ld t1, 40(sp) + ld t2, 48(sp) + ld s0, 56(sp) + ld s1, 64(sp) + ld a0, 72(sp) + ld a1, 80(sp) + ld a2, 88(sp) + ld a3, 96(sp) + ld a4, 104(sp) + ld a5, 112(sp) + ld a6, 120(sp) + ld a7, 128(sp) + ld s2, 136(sp) + ld s3, 144(sp) + ld s4, 152(sp) + ld s5, 160(sp) + ld s6, 168(sp) + ld s7, 176(sp) + ld s8, 184(sp) + ld s9, 192(sp) + ld s10, 200(sp) + ld s11, 208(sp) + ld t3, 216(sp) + ld t4, 224(sp) + ld t5, 232(sp) + ld t6, 240(sp) + + addi sp, sp, 256 + + // return to whatever we were doing in the kernel. + sret + + # + # machine-mode timer interrupt. + # +.globl timervec +.align 4 +timervec: + # start.c has set up the memory that mscratch points to: + # scratch[0,8,16] : register save area. + # scratch[32] : address of CLINT's MTIMECMP register. 
+ # scratch[40] : desired interval between interrupts. + + csrrw a0, mscratch, a0 + sd a1, 0(a0) + sd a2, 8(a0) + sd a3, 16(a0) + + # schedule the next timer interrupt + # by adding interval to mtimecmp. + ld a1, 32(a0) # CLINT_MTIMECMP(hart) + ld a2, 40(a0) # interval + ld a3, 0(a1) + add a3, a3, a2 + sd a3, 0(a1) + + # raise a supervisor software interrupt. + li a1, 2 + csrw sip, a1 + + ld a3, 16(a0) + ld a2, 8(a0) + ld a1, 0(a0) + csrrw a0, mscratch, a0 + + mret diff --git a/kernel/log.c b/kernel/log.c new file mode 100644 index 0000000..5e884bb --- /dev/null +++ b/kernel/log.c @@ -0,0 +1,235 @@ +#include "types.h" +#include "riscv.h" +#include "defs.h" +#include "param.h" +#include "spinlock.h" +#include "sleeplock.h" +#include "fs.h" +#include "buf.h" + +// Simple logging that allows concurrent FS system calls. +// +// A log transaction contains the updates of multiple FS system +// calls. The logging system only commits when there are +// no FS system calls active. Thus there is never +// any reasoning required about whether a commit might +// write an uncommitted system call's updates to disk. +// +// A system call should call begin_op()/end_op() to mark +// its start and end. Usually begin_op() just increments +// the count of in-progress FS system calls and returns. +// But if it thinks the log is close to running out, it +// sleeps until the last outstanding end_op() commits. +// +// The log is a physical re-do log containing disk blocks. +// The on-disk log format: +// header block, containing block #s for block A, B, C, ... +// block A +// block B +// block C +// ... +// Log appends are synchronous. + +// Contents of the header block, used for both the on-disk header block +// and to keep track in memory of logged block# before commit. +struct logheader { + int n; + int block[LOGSIZE]; +}; + +struct log { + struct spinlock lock; + int start; + int size; + int outstanding; // how many FS sys calls are executing. 
+ int committing; // in commit(), please wait. + int dev; + struct logheader lh; +}; +struct log log; + +static void recover_from_log(void); +static void commit(); + +void +initlog(int dev, struct superblock *sb) +{ + if (sizeof(struct logheader) >= BSIZE) + panic("initlog: too big logheader"); + + initlock(&log.lock, "log"); + log.start = sb->logstart; + log.size = sb->nlog; + log.dev = dev; + recover_from_log(); +} + +// Copy committed blocks from log to their home location +static void +install_trans(void) +{ + int tail; + + for (tail = 0; tail < log.lh.n; tail++) { + struct buf *lbuf = bread(log.dev, log.start+tail+1); // read log block + struct buf *dbuf = bread(log.dev, log.lh.block[tail]); // read dst + memmove(dbuf->data, lbuf->data, BSIZE); // copy block to dst + bwrite(dbuf); // write dst to disk + bunpin(dbuf); + brelse(lbuf); + brelse(dbuf); + } +} + +// Read the log header from disk into the in-memory log header +static void +read_head(void) +{ + struct buf *buf = bread(log.dev, log.start); + struct logheader *lh = (struct logheader *) (buf->data); + int i; + log.lh.n = lh->n; + for (i = 0; i < log.lh.n; i++) { + log.lh.block[i] = lh->block[i]; + } + brelse(buf); +} + +// Write in-memory log header to disk. +// This is the true point at which the +// current transaction commits. +static void +write_head(void) +{ + struct buf *buf = bread(log.dev, log.start); + struct logheader *hb = (struct logheader *) (buf->data); + int i; + hb->n = log.lh.n; + for (i = 0; i < log.lh.n; i++) { + hb->block[i] = log.lh.block[i]; + } + bwrite(buf); + brelse(buf); +} + +static void +recover_from_log(void) +{ + read_head(); + install_trans(); // if committed, copy from log to disk + log.lh.n = 0; + write_head(); // clear the log +} + +// called at the start of each FS system call. 
void
begin_op(void)
{
  acquire(&log.lock);
  while(1){
    if(log.committing){
      // A commit is in progress; wait for it to finish.
      sleep(&log, &log.lock);
    } else if(log.lh.n + (log.outstanding+1)*MAXOPBLOCKS > LOGSIZE){
      // this op might exhaust log space; wait for commit.
      // Each in-flight call is assumed to write at most
      // MAXOPBLOCKS blocks, so reserve that much pessimistically.
      sleep(&log, &log.lock);
    } else {
      // Reserve space for this operation and proceed.
      log.outstanding += 1;
      release(&log.lock);
      break;
    }
  }
}

// called at the end of each FS system call.
// commits if this was the last outstanding operation.
void
end_op(void)
{
  int do_commit = 0;

  acquire(&log.lock);
  log.outstanding -= 1;
  if(log.committing)
    panic("log.committing");
  if(log.outstanding == 0){
    // Last operation in the transaction: this caller commits.
    do_commit = 1;
    log.committing = 1;
  } else {
    // begin_op() may be waiting for log space,
    // and decrementing log.outstanding has decreased
    // the amount of reserved space.
    wakeup(&log);
  }
  release(&log.lock);

  if(do_commit){
    // call commit w/o holding locks, since not allowed
    // to sleep with locks.
    commit();
    acquire(&log.lock);
    log.committing = 0;
    wakeup(&log);
    release(&log.lock);
  }
}

// Copy modified blocks from cache to log.
static void
write_log(void)
{
  int tail;

  for (tail = 0; tail < log.lh.n; tail++) {
    struct buf *to = bread(log.dev, log.start+tail+1); // log block
    struct buf *from = bread(log.dev, log.lh.block[tail]); // cache block
    memmove(to->data, from->data, BSIZE);
    bwrite(to); // write the log
    brelse(from);
    brelse(to);
  }
}

// Commit the current transaction, in four steps; writing the
// header (step 2) is the atomic commit point.
static void
commit()
{
  if (log.lh.n > 0) {
    write_log();     // Write modified blocks from cache to log
    write_head();    // Write header to disk -- the real commit
    install_trans(); // Now install writes to home locations
    log.lh.n = 0;
    write_head();    // Erase the transaction from the log
  }
}

// Caller has modified b->data and is done with the buffer.
// Record the block number and pin in the cache by increasing refcnt.
// commit()/write_log() will do the disk write.
//
// log_write() replaces bwrite(); a typical use is:
//   bp = bread(...)
+// modify bp->data[] +// log_write(bp) +// brelse(bp) +void +log_write(struct buf *b) +{ + int i; + + if (log.lh.n >= LOGSIZE || log.lh.n >= log.size - 1) + panic("too big a transaction"); + if (log.outstanding < 1) + panic("log_write outside of trans"); + + acquire(&log.lock); + for (i = 0; i < log.lh.n; i++) { + if (log.lh.block[i] == b->blockno) // log absorbtion + break; + } + log.lh.block[i] = b->blockno; + if (i == log.lh.n) { // Add new block to log? + bpin(b); + log.lh.n++; + } + release(&log.lock); +} + diff --git a/kernel/main.c b/kernel/main.c new file mode 100644 index 0000000..a936fd3 --- /dev/null +++ b/kernel/main.c @@ -0,0 +1,43 @@ +#include "types.h" +#include "param.h" +#include "memlayout.h" +#include "riscv.h" +#include "defs.h" + +volatile static int started = 0; + +// start() jumps here in supervisor mode on all CPUs. +void +main() +{ + if(cpuid() == 0){ + consoleinit(); + printfinit(); + printf("hart %d starting\n", cpuid()); + kinit(); // physical page allocator + kvminit(); // create kernel page table + kvminithart(); // turn on paging + procinit(); // process table + trapinit(); // trap vectors + trapinithart(); // install kernel trap vector + plicinit(); // set up interrupt controller + plicinithart(); // ask PLIC for device interrupts + binit(); // buffer cache + iinit(); // inode cache + fileinit(); // file table + virtio_disk_init(); // emulated hard disk + userinit(); // first user process + __sync_synchronize(); + started = 1; + } else { + while(started == 0) + ; + __sync_synchronize(); + printf("hart %d starting\n", cpuid()); + kvminithart(); // turn on paging + trapinithart(); // install kernel trap vector + plicinithart(); // ask PLIC for device interrupts + } + + scheduler(); +} diff --git a/kernel/memlayout.h b/kernel/memlayout.h new file mode 100644 index 0000000..8ffd538 --- /dev/null +++ b/kernel/memlayout.h @@ -0,0 +1,67 @@ +// Physical memory layout + +// qemu -machine virt is set up like this, +// based on qemu's 
hw/riscv/virt.c: +// +// 00001000 -- boot ROM, provided by qemu +// 02000000 -- CLINT +// 0C000000 -- PLIC +// 10000000 -- uart0 +// 10001000 -- virtio disk +// 80000000 -- boot ROM jumps here in machine mode +// -kernel loads the kernel here +// unused RAM after 80000000. + +// the kernel uses physical memory thus: +// 80000000 -- entry.S, then kernel text and data +// end -- start of kernel page allocation area +// PHYSTOP -- end RAM used by the kernel + +// qemu puts UART registers here in physical memory. +#define UART0 0x10000000L +#define UART0_IRQ 10 + +// virtio mmio interface +#define VIRTIO0 0x10001000 +#define VIRTIO0_IRQ 1 + +// local interrupt controller, which contains the timer. +#define CLINT 0x2000000L +#define CLINT_MTIMECMP(hartid) (CLINT + 0x4000 + 8*(hartid)) +#define CLINT_MTIME (CLINT + 0xBFF8) // cycles since boot. + +// qemu puts programmable interrupt controller here. +#define PLIC 0x0c000000L +#define PLIC_PRIORITY (PLIC + 0x0) +#define PLIC_PENDING (PLIC + 0x1000) +#define PLIC_MENABLE(hart) (PLIC + 0x2000 + (hart)*0x100) +#define PLIC_SENABLE(hart) (PLIC + 0x2080 + (hart)*0x100) +#define PLIC_MPRIORITY(hart) (PLIC + 0x200000 + (hart)*0x2000) +#define PLIC_SPRIORITY(hart) (PLIC + 0x201000 + (hart)*0x2000) +#define PLIC_MCLAIM(hart) (PLIC + 0x200004 + (hart)*0x2000) +#define PLIC_SCLAIM(hart) (PLIC + 0x201004 + (hart)*0x2000) + +// the kernel expects there to be RAM +// for use by the kernel and user pages +// from physical address 0x80000000 to PHYSTOP. +#define KERNBASE 0x80000000L +#define PHYSTOP (KERNBASE + 128*1024*1024) + +// map the trampoline page to the highest address, +// in both user and kernel space. +#define TRAMPOLINE (MAXVA - PGSIZE) + +// map kernel stacks beneath the trampoline, +// each surrounded by invalid guard pages. +#define KSTACK(p) (TRAMPOLINE - ((p)+1)* 2*PGSIZE) + +// User memory layout. +// Address zero first: +// text +// original data and bss +// fixed-size stack +// expandable heap +// ... 
+// TRAPFRAME (p->tf, used by the trampoline) +// TRAMPOLINE (the same page as in the kernel) +#define TRAPFRAME (TRAMPOLINE - PGSIZE) diff --git a/kernel/param.h b/kernel/param.h new file mode 100644 index 0000000..b5fdcb2 --- /dev/null +++ b/kernel/param.h @@ -0,0 +1,13 @@ +#define NPROC 64 // maximum number of processes +#define NCPU 8 // maximum number of CPUs +#define NOFILE 16 // open files per process +#define NFILE 100 // open files per system +#define NINODE 50 // maximum number of active i-nodes +#define NDEV 10 // maximum major device number +#define ROOTDEV 1 // device number of file system root disk +#define MAXARG 32 // max exec arguments +#define MAXOPBLOCKS 10 // max # of blocks any FS op writes +#define LOGSIZE (MAXOPBLOCKS*3) // max data blocks in on-disk log +#define NBUF (MAXOPBLOCKS*3) // size of disk block cache +#define FSSIZE 1000 // size of file system in blocks +#define MAXPATH 128 // maximum file path name diff --git a/kernel/pipe.c b/kernel/pipe.c new file mode 100644 index 0000000..e358283 --- /dev/null +++ b/kernel/pipe.c @@ -0,0 +1,127 @@ +#include "types.h" +#include "riscv.h" +#include "defs.h" +#include "param.h" +#include "spinlock.h" +#include "proc.h" +#include "fs.h" +#include "sleeplock.h" +#include "file.h" + +#define PIPESIZE 512 + +struct pipe { + struct spinlock lock; + char data[PIPESIZE]; + uint nread; // number of bytes read + uint nwrite; // number of bytes written + int readopen; // read fd is still open + int writeopen; // write fd is still open +}; + +int +pipealloc(struct file **f0, struct file **f1) +{ + struct pipe *pi; + + pi = 0; + *f0 = *f1 = 0; + if((*f0 = filealloc()) == 0 || (*f1 = filealloc()) == 0) + goto bad; + if((pi = (struct pipe*)kalloc()) == 0) + goto bad; + pi->readopen = 1; + pi->writeopen = 1; + pi->nwrite = 0; + pi->nread = 0; + initlock(&pi->lock, "pipe"); + (*f0)->type = FD_PIPE; + (*f0)->readable = 1; + (*f0)->writable = 0; + (*f0)->pipe = pi; + (*f1)->type = FD_PIPE; + (*f1)->readable = 0; + 
(*f1)->writable = 1; + (*f1)->pipe = pi; + return 0; + + bad: + if(pi) + kfree((char*)pi); + if(*f0) + fileclose(*f0); + if(*f1) + fileclose(*f1); + return -1; +} + +void +pipeclose(struct pipe *pi, int writable) +{ + acquire(&pi->lock); + if(writable){ + pi->writeopen = 0; + wakeup(&pi->nread); + } else { + pi->readopen = 0; + wakeup(&pi->nwrite); + } + if(pi->readopen == 0 && pi->writeopen == 0){ + release(&pi->lock); + kfree((char*)pi); + } else + release(&pi->lock); +} + +int +pipewrite(struct pipe *pi, uint64 addr, int n) +{ + int i; + char ch; + struct proc *pr = myproc(); + + acquire(&pi->lock); + for(i = 0; i < n; i++){ + while(pi->nwrite == pi->nread + PIPESIZE){ //DOC: pipewrite-full + if(pi->readopen == 0 || myproc()->killed){ + release(&pi->lock); + return -1; + } + wakeup(&pi->nread); + sleep(&pi->nwrite, &pi->lock); + } + if(copyin(pr->pagetable, &ch, addr + i, 1) == -1) + break; + pi->data[pi->nwrite++ % PIPESIZE] = ch; + } + wakeup(&pi->nread); + release(&pi->lock); + return n; +} + +int +piperead(struct pipe *pi, uint64 addr, int n) +{ + int i; + struct proc *pr = myproc(); + char ch; + + acquire(&pi->lock); + while(pi->nread == pi->nwrite && pi->writeopen){ //DOC: pipe-empty + if(myproc()->killed){ + release(&pi->lock); + return -1; + } + sleep(&pi->nread, &pi->lock); //DOC: piperead-sleep + } + for(i = 0; i < n; i++){ //DOC: piperead-copy + if(pi->nread == pi->nwrite) + break; + ch = pi->data[pi->nread++ % PIPESIZE]; + if(copyout(pr->pagetable, addr + i, &ch, 1) == -1) + break; + } + wakeup(&pi->nwrite); //DOC: piperead-wakeup + release(&pi->lock); + return i; +} diff --git a/kernel/plic.c b/kernel/plic.c new file mode 100644 index 0000000..b569492 --- /dev/null +++ b/kernel/plic.c @@ -0,0 +1,62 @@ +#include "types.h" +#include "param.h" +#include "memlayout.h" +#include "riscv.h" +#include "defs.h" + +// +// the riscv Platform Level Interrupt Controller (PLIC). 
+// + +void +plicinit(void) +{ + // set desired IRQ priorities non-zero (otherwise disabled). + *(uint32*)(PLIC + UART0_IRQ*4) = 1; + *(uint32*)(PLIC + VIRTIO0_IRQ*4) = 1; +} + +void +plicinithart(void) +{ + int hart = cpuid(); + + // set uart's enable bit for this hart's S-mode. + *(uint32*)PLIC_SENABLE(hart)= (1 << UART0_IRQ) | (1 << VIRTIO0_IRQ); + + // set this hart's S-mode priority threshold to 0. + *(uint32*)PLIC_SPRIORITY(hart) = 0; +} + +// return a bitmap of which IRQs are waiting +// to be served. +uint64 +plic_pending(void) +{ + uint64 mask; + + //mask = *(uint32*)(PLIC + 0x1000); + //mask |= (uint64)*(uint32*)(PLIC + 0x1004) << 32; + mask = *(uint64*)PLIC_PENDING; + + return mask; +} + +// ask the PLIC what interrupt we should serve. +int +plic_claim(void) +{ + int hart = cpuid(); + //int irq = *(uint32*)(PLIC + 0x201004); + int irq = *(uint32*)PLIC_SCLAIM(hart); + return irq; +} + +// tell the PLIC we've served this IRQ. +void +plic_complete(int irq) +{ + int hart = cpuid(); + //*(uint32*)(PLIC + 0x201004) = irq; + *(uint32*)PLIC_SCLAIM(hart) = irq; +} diff --git a/kernel/printf.c b/kernel/printf.c new file mode 100644 index 0000000..777cc5f --- /dev/null +++ b/kernel/printf.c @@ -0,0 +1,134 @@ +// +// formatted console output -- printf, panic. +// + +#include <stdarg.h> + +#include "types.h" +#include "param.h" +#include "spinlock.h" +#include "sleeplock.h" +#include "fs.h" +#include "file.h" +#include "memlayout.h" +#include "riscv.h" +#include "defs.h" +#include "proc.h" + +volatile int panicked = 0; + +// lock to avoid interleaving concurrent printf's. 
+static struct { + struct spinlock lock; + int locking; +} pr; + +static char digits[] = "0123456789abcdef"; + +static void +printint(int xx, int base, int sign) +{ + char buf[16]; + int i; + uint x; + + if(sign && (sign = xx < 0)) + x = -xx; + else + x = xx; + + i = 0; + do { + buf[i++] = digits[x % base]; + } while((x /= base) != 0); + + if(sign) + buf[i++] = '-'; + + while(--i >= 0) + consputc(buf[i]); +} + +static void +printptr(uint64 x) +{ + int i; + consputc('0'); + consputc('x'); + for (i = 0; i < (sizeof(uint64) * 2); i++, x <<= 4) + consputc(digits[x >> (sizeof(uint64) * 8 - 4)]); +} + +// Print to the console. only understands %d, %x, %p, %s. +void +printf(char *fmt, ...) +{ + va_list ap; + int i, c, locking; + char *s; + + locking = pr.locking; + if(locking) + acquire(&pr.lock); + + if (fmt == 0) + panic("null fmt"); + + va_start(ap, fmt); + for(i = 0; (c = fmt[i] & 0xff) != 0; i++){ + if(c != '%'){ + consputc(c); + continue; + } + c = fmt[++i] & 0xff; + if(c == 0) + break; + switch(c){ + case 'd': + printint(va_arg(ap, int), 10, 1); + break; + case 'x': + printint(va_arg(ap, int), 16, 1); + break; + case 'p': + printptr(va_arg(ap, uint64)); + break; + case 's': + if((s = va_arg(ap, char*)) == 0) + s = "(null)"; + for(; *s; s++) + consputc(*s); + break; + case '%': + consputc('%'); + break; + default: + // Print unknown % sequence to draw attention. 
+ consputc('%'); + consputc(c); + break; + } + } + + if(locking) + release(&pr.lock); +} + +void +panic(char *s) +{ + pr.locking = 0; + printf("panic: "); + printf(s); + printf("\n"); + panicked = 1; // freeze other CPUs + for(;;) + ; +} + +void +printfinit(void) +{ + initlock(&pr.lock, "pr"); + pr.locking = 1; +} diff --git a/kernel/proc.c b/kernel/proc.c new file mode 100644 index 0000000..428fdb0 --- /dev/null +++ b/kernel/proc.c @@ -0,0 +1,647 @@ +#include "types.h" +#include "param.h" +#include "memlayout.h" +#include "riscv.h" +#include "spinlock.h" +#include "proc.h" +#include "defs.h" + +struct cpu cpus[NCPU]; + +struct proc proc[NPROC]; + +struct proc *initproc; + +int nextpid = 1; +struct spinlock pid_lock; + +extern void forkret(void); +static void wakeup1(struct proc *chan); + +extern char trampoline[]; // trampoline.S + +void +procinit(void) +{ + struct proc *p; + + initlock(&pid_lock, "nextpid"); + for(p = proc; p < &proc[NPROC]; p++) { + initlock(&p->lock, "proc"); + + // Allocate a page for the process's kernel stack. + // Map it high in memory, followed by an invalid + // guard page. + char *pa = kalloc(); + if(pa == 0) + panic("kalloc"); + uint64 va = KSTACK((int) (p - proc)); + kvmmap(va, (uint64)pa, PGSIZE, PTE_R | PTE_W); + p->kstack = va; + } + kvminithart(); +} + +// Must be called with interrupts disabled, +// to prevent race with process being moved +// to a different CPU. +int +cpuid() +{ + int id = r_tp(); + return id; +} + +// Return this CPU's cpu struct. +// Interrupts must be disabled. +struct cpu* +mycpu(void) { + int id = cpuid(); + struct cpu *c = &cpus[id]; + return c; +} + +// Return the current struct proc *, or zero if none. 
+struct proc* +myproc(void) { + push_off(); + struct cpu *c = mycpu(); + struct proc *p = c->proc; + pop_off(); + return p; +} + +int +allocpid() { + int pid; + + acquire(&pid_lock); + pid = nextpid; + nextpid = nextpid + 1; + release(&pid_lock); + + return pid; +} + +// Look in the process table for an UNUSED proc. +// If found, initialize state required to run in the kernel, +// and return with p->lock held. +// If there are no free procs, return 0. +static struct proc* +allocproc(void) +{ + struct proc *p; + + for(p = proc; p < &proc[NPROC]; p++) { + acquire(&p->lock); + if(p->state == UNUSED) { + goto found; + } else { + release(&p->lock); + } + } + return 0; + +found: + p->pid = allocpid(); + + // Allocate a trapframe page. + if((p->tf = (struct trapframe *)kalloc()) == 0){ + release(&p->lock); + return 0; + } + + // An empty user page table. + p->pagetable = proc_pagetable(p); + + // Set up new context to start executing at forkret, + // which returns to user space. + memset(&p->context, 0, sizeof p->context); + p->context.ra = (uint64)forkret; + p->context.sp = p->kstack + PGSIZE; + + return p; +} + +// free a proc structure and the data hanging from it, +// including user pages. +// p->lock must be held. +static void +freeproc(struct proc *p) +{ + if(p->tf) + kfree((void*)p->tf); + p->tf = 0; + if(p->pagetable) + proc_freepagetable(p->pagetable, p->sz); + p->pagetable = 0; + p->sz = 0; + p->pid = 0; + p->parent = 0; + p->name[0] = 0; + p->chan = 0; + p->killed = 0; + p->state = UNUSED; +} + +// Create a page table for a given process, +// with no user pages, but with trampoline pages. +pagetable_t +proc_pagetable(struct proc *p) +{ + pagetable_t pagetable; + + // An empty page table. + pagetable = uvmcreate(); + + // map the trampoline code (for system call return) + // at the highest user virtual address. + // only the supervisor uses it, on the way + // to/from user space, so not PTE_U. 
+ mappages(pagetable, TRAMPOLINE, PGSIZE, + (uint64)trampoline, PTE_R | PTE_X); + + // map the trapframe just below TRAMPOLINE, for trampoline.S. + mappages(pagetable, TRAPFRAME, PGSIZE, + (uint64)(p->tf), PTE_R | PTE_W); + + return pagetable; +} + +// Free a process's page table, and free the +// physical memory it refers to. +void +proc_freepagetable(pagetable_t pagetable, uint64 sz) +{ + uvmunmap(pagetable, TRAMPOLINE, PGSIZE, 0); + uvmunmap(pagetable, TRAPFRAME, PGSIZE, 0); + if(sz > 0) + uvmfree(pagetable, sz); +} + +// a user program that calls exec("/init") +// od -t xC initcode +uchar initcode[] = { + 0x17, 0x05, 0x00, 0x00, 0x13, 0x05, 0x05, 0x02, + 0x97, 0x05, 0x00, 0x00, 0x93, 0x85, 0x05, 0x02, + 0x9d, 0x48, 0x73, 0x00, 0x00, 0x00, 0x89, 0x48, + 0x73, 0x00, 0x00, 0x00, 0xef, 0xf0, 0xbf, 0xff, + 0x2f, 0x69, 0x6e, 0x69, 0x74, 0x00, 0x00, 0x01, + 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00 +}; + +// Set up first user process. +void +userinit(void) +{ + struct proc *p; + + p = allocproc(); + initproc = p; + + // allocate one user page and copy init's instructions + // and data into it. + uvminit(p->pagetable, initcode, sizeof(initcode)); + p->sz = PGSIZE; + + // prepare for the very first "return" from kernel to user. + p->tf->epc = 0; // user program counter + p->tf->sp = PGSIZE; // user stack pointer + + safestrcpy(p->name, "initcode", sizeof(p->name)); + p->cwd = namei("/"); + + p->state = RUNNABLE; + + release(&p->lock); +} + +// Grow or shrink user memory by n bytes. +// Return 0 on success, -1 on failure. +int +growproc(int n) +{ + uint sz; + struct proc *p = myproc(); + + sz = p->sz; + if(n > 0){ + if((sz = uvmalloc(p->pagetable, sz, sz + n)) == 0) { + return -1; + } + } else if(n < 0){ + if((sz = uvmdealloc(p->pagetable, sz, sz + n)) == 0) { + return -1; + } + } + p->sz = sz; + return 0; +} + +// Create a new process, copying the parent. +// Sets up child kernel stack to return as if from fork() system call. 
+int +fork(void) +{ + int i, pid; + struct proc *np; + struct proc *p = myproc(); + + // Allocate process. + if((np = allocproc()) == 0){ + return -1; + } + + // Copy user memory from parent to child. + if(uvmcopy(p->pagetable, np->pagetable, p->sz) < 0){ + freeproc(np); + release(&np->lock); + return -1; + } + np->sz = p->sz; + + np->parent = p; + + // copy saved user registers. + *(np->tf) = *(p->tf); + + // Cause fork to return 0 in the child. + np->tf->a0 = 0; + + // increment reference counts on open file descriptors. + for(i = 0; i < NOFILE; i++) + if(p->ofile[i]) + np->ofile[i] = filedup(p->ofile[i]); + np->cwd = idup(p->cwd); + + safestrcpy(np->name, p->name, sizeof(p->name)); + + pid = np->pid; + + np->state = RUNNABLE; + + release(&np->lock); + + return pid; +} + +// Pass p's abandoned children to init. +// Caller must hold p->lock and parent->lock. +void +reparent(struct proc *p, struct proc *parent) { + struct proc *pp; + int child_of_init = (p->parent == initproc); + + for(pp = proc; pp < &proc[NPROC]; pp++){ + // this code uses pp->parent without holding pp->lock. + // acquiring the lock first could cause a deadlock + // if pp or a child of pp were also in exit() + // and about to try to lock p. + if(pp->parent == p){ + // pp->parent can't change between the check and the acquire() + // because only the parent changes it, and we're the parent. + acquire(&pp->lock); + pp->parent = initproc; + if(pp->state == ZOMBIE) { + if(!child_of_init) + acquire(&initproc->lock); + wakeup1(initproc); + if(!child_of_init) + release(&initproc->lock); + } + release(&pp->lock); + } + } +} + +// Exit the current process. Does not return. +// An exited process remains in the zombie state +// until its parent calls wait(). +void +exit(void) +{ + struct proc *p = myproc(); + + if(p == initproc) + panic("init exiting"); + + // Close all open files. 
+ for(int fd = 0; fd < NOFILE; fd++){ + if(p->ofile[fd]){ + struct file *f = p->ofile[fd]; + fileclose(f); + p->ofile[fd] = 0; + } + } + + begin_op(); + iput(p->cwd); + end_op(); + p->cwd = 0; + + acquire(&p->parent->lock); + + acquire(&p->lock); + + // Give any children to init. + reparent(p, p->parent); + + // Parent might be sleeping in wait(). + wakeup1(p->parent); + + p->state = ZOMBIE; + + release(&p->parent->lock); + + // Jump into the scheduler, never to return. + sched(); + panic("zombie exit"); +} + +// Wait for a child process to exit and return its pid. +// Return -1 if this process has no children. +int +wait(void) +{ + struct proc *np; + int havekids, pid; + struct proc *p = myproc(); + + // hold p->lock for the whole time to avoid lost + // wakeups from a child's exit(). + acquire(&p->lock); + + for(;;){ + // Scan through table looking for exited children. + havekids = 0; + for(np = proc; np < &proc[NPROC]; np++){ + // this code uses np->parent without holding np->lock. + // acquiring the lock first would cause a deadlock, + // since np might be an ancestor, and we already hold p->lock. + if(np->parent == p){ + // np->parent can't change between the check and the acquire() + // because only the parent changes it, and we're the parent. + acquire(&np->lock); + havekids = 1; + if(np->state == ZOMBIE){ + // Found one. + pid = np->pid; + freeproc(np); + release(&np->lock); + release(&p->lock); + return pid; + } + release(&np->lock); + } + } + + // No point waiting if we don't have any children. + if(!havekids || p->killed){ + release(&p->lock); + return -1; + } + + // Wait for a child to exit. + sleep(p, &p->lock); //DOC: wait-sleep + } +} + +// Per-CPU process scheduler. +// Each CPU calls scheduler() after setting itself up. +// Scheduler never returns. It loops, doing: +// - choose a process to run. +// - swtch to start running that process. +// - eventually that process transfers control +// via swtch back to the scheduler. 
+void +scheduler(void) +{ + struct proc *p; + struct cpu *c = mycpu(); + + c->proc = 0; + for(;;){ + // Avoid deadlock by ensuring that devices can interrupt. + intr_on(); + + for(p = proc; p < &proc[NPROC]; p++) { + acquire(&p->lock); + if(p->state == RUNNABLE) { + // Switch to chosen process. It is the process's job + // to release its lock and then reacquire it + // before jumping back to us. + p->state = RUNNING; + c->proc = p; + swtch(&c->scheduler, &p->context); + + // Process is done running for now. + // It should have changed its p->state before coming back. + c->proc = 0; + } + release(&p->lock); + } + } +} + +// Switch to scheduler. Must hold only p->lock +// and have changed proc->state. Saves and restores +// intena because intena is a property of this +// kernel thread, not this CPU. It should +// be proc->intena and proc->noff, but that would +// break in the few places where a lock is held but +// there's no process. +void +sched(void) +{ + int intena; + struct proc *p = myproc(); + + if(!holding(&p->lock)) + panic("sched p->lock"); + if(mycpu()->noff != 1) + panic("sched locks"); + if(p->state == RUNNING) + panic("sched running"); + if(intr_get()) + panic("sched interruptible"); + + intena = mycpu()->intena; + swtch(&p->context, &mycpu()->scheduler); + mycpu()->intena = intena; +} + +// Give up the CPU for one scheduling round. +void +yield(void) +{ + struct proc *p = myproc(); + acquire(&p->lock); + p->state = RUNNABLE; + sched(); + release(&p->lock); +} + +// A fork child's very first scheduling by scheduler() +// will swtch to forkret. +void +forkret(void) +{ + static int first = 1; + + // Still holding p->lock from scheduler. + release(&myproc()->lock); + + if (first) { + // File system initialization must be run in the context of a + // regular process (e.g., because it calls sleep), and thus cannot + // be run from main(). + first = 0; + fsinit(ROOTDEV); + } + + usertrapret(); +} + +// Atomically release lock and sleep on chan. 
+// Reacquires lock when awakened. +void +sleep(void *chan, struct spinlock *lk) +{ + struct proc *p = myproc(); + + // Must acquire p->lock in order to + // change p->state and then call sched. + // Once we hold p->lock, we can be + // guaranteed that we won't miss any wakeup + // (wakeup locks p->lock), + // so it's okay to release lk. + if(lk != &p->lock){ //DOC: sleeplock0 + acquire(&p->lock); //DOC: sleeplock1 + release(lk); + } + + // Go to sleep. + p->chan = chan; + p->state = SLEEPING; + + sched(); + + // Tidy up. + p->chan = 0; + + // Reacquire original lock. + if(lk != &p->lock){ + release(&p->lock); + acquire(lk); + } +} + +// Wake up all processes sleeping on chan. +// Must be called without any p->lock. +void +wakeup(void *chan) +{ + struct proc *p; + + for(p = proc; p < &proc[NPROC]; p++) { + acquire(&p->lock); + if(p->state == SLEEPING && p->chan == chan) { + p->state = RUNNABLE; + } + release(&p->lock); + } +} + +// Wake up p if it is sleeping in wait(); used by exit(). +// Caller must hold p->lock. +static void +wakeup1(struct proc *p) +{ + if(p->chan == p && p->state == SLEEPING) { + p->state = RUNNABLE; + } +} + +// Kill the process with the given pid. +// The victim won't exit until it tries to return +// to user space (see usertrap() in trap.c). +int +kill(int pid) +{ + struct proc *p; + + for(p = proc; p < &proc[NPROC]; p++){ + acquire(&p->lock); + if(p->pid == pid){ + p->killed = 1; + if(p->state == SLEEPING){ + // Wake process from sleep(). + p->state = RUNNABLE; + } + release(&p->lock); + return 0; + } + release(&p->lock); + } + return -1; +} + +// Copy to either a user address, or kernel address, +// depending on usr_dst. +// Returns 0 on success, -1 on error. 
+int +either_copyout(int user_dst, uint64 dst, void *src, uint64 len) +{ + struct proc *p = myproc(); + if(user_dst){ + return copyout(p->pagetable, dst, src, len); + } else { + memmove((char *)dst, src, len); + return 0; + } +} + +// Copy from either a user address, or kernel address, +// depending on usr_src. +// Returns 0 on success, -1 on error. +int +either_copyin(void *dst, int user_src, uint64 src, uint64 len) +{ + struct proc *p = myproc(); + if(user_src){ + return copyin(p->pagetable, dst, src, len); + } else { + memmove(dst, (char*)src, len); + return 0; + } +} + +// Print a process listing to console. For debugging. +// Runs when user types ^P on console. +// No lock to avoid wedging a stuck machine further. +void +procdump(void) +{ + static char *states[] = { + [UNUSED] "unused", + [SLEEPING] "sleep ", + [RUNNABLE] "runble", + [RUNNING] "run ", + [ZOMBIE] "zombie" + }; + struct proc *p; + char *state; + + printf("\n"); + for(p = proc; p < &proc[NPROC]; p++){ + if(p->state == UNUSED) + continue; + if(p->state >= 0 && p->state < NELEM(states) && states[p->state]) + state = states[p->state]; + else + state = "???"; + printf("%d %s %s", p->pid, state, p->name); + printf("\n"); + } +} diff --git a/kernel/proc.h b/kernel/proc.h new file mode 100644 index 0000000..655d79f --- /dev/null +++ b/kernel/proc.h @@ -0,0 +1,105 @@ +// Saved registers for kernel context switches. +struct context { + uint64 ra; + uint64 sp; + + // callee-saved + uint64 s0; + uint64 s1; + uint64 s2; + uint64 s3; + uint64 s4; + uint64 s5; + uint64 s6; + uint64 s7; + uint64 s8; + uint64 s9; + uint64 s10; + uint64 s11; +}; + +// Per-CPU state. +struct cpu { + struct proc *proc; // The process running on this cpu, or null. + struct context scheduler; // swtch() here to enter scheduler(). + int noff; // Depth of push_off() nesting. + int intena; // Were interrupts enabled before push_off()? +}; + +extern struct cpu cpus[NCPU]; + +// per-process data for the trap handling code in trampoline.S. 
+// sits in a page by itself just under the trampoline page in the +// user page table. not specially mapped in the kernel page table. +// the sscratch register points here. +// uservec in trampoline.S saves user registers in the trapframe, +// then initializes registers from the trapframe's +// kernel_sp, kernel_hartid, kernel_satp, and jumps to kernel_trap. +// usertrapret() and userret in trampoline.S set up +// the trapframe's kernel_*, restore user registers from the +// trapframe, switch to the user page table, and enter user space. +// the trapframe includes callee-saved user registers like s0-s11 because the +// return-to-user path via usertrapret() doesn't return through +// the entire kernel call stack. +struct trapframe { + /* 0 */ uint64 kernel_satp; // kernel page table + /* 8 */ uint64 kernel_sp; // top of process's kernel stack + /* 16 */ uint64 kernel_trap; // usertrap() + /* 24 */ uint64 epc; // saved user program counter + /* 32 */ uint64 kernel_hartid; // saved kernel tp + /* 40 */ uint64 ra; + /* 48 */ uint64 sp; + /* 56 */ uint64 gp; + /* 64 */ uint64 tp; + /* 72 */ uint64 t0; + /* 80 */ uint64 t1; + /* 88 */ uint64 t2; + /* 96 */ uint64 s0; + /* 104 */ uint64 s1; + /* 112 */ uint64 a0; + /* 120 */ uint64 a1; + /* 128 */ uint64 a2; + /* 136 */ uint64 a3; + /* 144 */ uint64 a4; + /* 152 */ uint64 a5; + /* 160 */ uint64 a6; + /* 168 */ uint64 a7; + /* 176 */ uint64 s2; + /* 184 */ uint64 s3; + /* 192 */ uint64 s4; + /* 200 */ uint64 s5; + /* 208 */ uint64 s6; + /* 216 */ uint64 s7; + /* 224 */ uint64 s8; + /* 232 */ uint64 s9; + /* 240 */ uint64 s10; + /* 248 */ uint64 s11; + /* 256 */ uint64 t3; + /* 264 */ uint64 t4; + /* 272 */ uint64 t5; + /* 280 */ uint64 t6; +}; + +enum procstate { UNUSED, SLEEPING, RUNNABLE, RUNNING, ZOMBIE }; + +// Per-process state +struct proc { + struct spinlock lock; + + // p->lock must be held when using these: + enum procstate state; // Process state + struct proc *parent; // Parent process + void *chan; // If 
non-zero, sleeping on chan
+  int killed;                  // If non-zero, have been killed
+  int pid;                     // Process ID
+
+  // these are private to the process, so p->lock need not be held.
+  uint64 kstack;               // Bottom of kernel stack for this process
+  uint64 sz;                   // Size of process memory (bytes)
+  pagetable_t pagetable;       // Page table
+  struct trapframe *tf;        // data page for trampoline.S
+  struct context context;      // swtch() here to run process
+  struct file *ofile[NOFILE];  // Open files
+  struct inode *cwd;           // Current directory
+  char name[16];               // Process name (debugging)
+};
diff --git a/kernel/ramdisk.c b/kernel/ramdisk.c
new file mode 100644
index 0000000..9901294
--- /dev/null
+++ b/kernel/ramdisk.c
@@ -0,0 +1,45 @@
+//
+// ramdisk that uses the disk image loaded by qemu -rdinit fs.img
+//
+
+#include "types.h"
+#include "riscv.h"
+#include "defs.h"
+#include "param.h"
+#include "memlayout.h"
+#include "spinlock.h"
+#include "sleeplock.h"
+#include "fs.h"
+#include "buf.h"
+
+void
+ramdiskinit(void)
+{
+}
+
+// Write buf to the ramdisk if B_DIRTY is set, else read it and set B_VALID.
+// NOTE(review): uses b->flags/B_VALID/B_DIRTY, which buf.h (valid/disk) does not appear to declare -- confirm this file is excluded from the build or port it to the new buf interface.
+void
+ramdiskrw(struct buf *b)
+{
+  if(!holdingsleep(&b->lock))
+    panic("ramdiskrw: buf not locked");
+  if((b->flags & (B_VALID|B_DIRTY)) == B_VALID)
+    panic("ramdiskrw: nothing to do");
+
+  if(b->blockno >= FSSIZE)
+    panic("ramdiskrw: blockno too big");
+
+  uint64 diskaddr = b->blockno * BSIZE;
+  char *addr = (char *)RAMDISK + diskaddr;
+
+  if(b->flags & B_DIRTY){
+    // write
+    memmove(addr, b->data, BSIZE);
+    b->flags &= ~B_DIRTY;
+  } else {
+    // read
+    memmove(b->data, addr, BSIZE);
+    b->flags |= B_VALID;
+  }
+}
diff --git a/kernel/riscv.h b/kernel/riscv.h
new file mode 100644
index 0000000..0f83db6
--- /dev/null
+++ b/kernel/riscv.h
@@ -0,0 +1,358 @@
+// which hart (core) is this?
+static inline uint64 +r_mhartid() +{ + uint64 x; + asm volatile("csrr %0, mhartid" : "=r" (x) ); + return x; +} + +// Machine Status Register, mstatus + +#define MSTATUS_MPP_MASK (3L << 11) // previous mode. +#define MSTATUS_MPP_M (3L << 11) +#define MSTATUS_MPP_S (1L << 11) +#define MSTATUS_MPP_U (0L << 11) +#define MSTATUS_MIE (1L << 3) // machine-mode interrupt enable. + +static inline uint64 +r_mstatus() +{ + uint64 x; + asm volatile("csrr %0, mstatus" : "=r" (x) ); + return x; +} + +static inline void +w_mstatus(uint64 x) +{ + asm volatile("csrw mstatus, %0" : : "r" (x)); +} + +// machine exception program counter, holds the +// instruction address to which a return from +// exception will go. +static inline void +w_mepc(uint64 x) +{ + asm volatile("csrw mepc, %0" : : "r" (x)); +} + +// Supervisor Status Register, sstatus + +#define SSTATUS_SPP (1L << 8) // Previous mode, 1=Supervisor, 0=User +#define SSTATUS_SPIE (1L << 5) // Supervisor Previous Interrupt Enable +#define SSTATUS_UPIE (1L << 4) // User Previous Interrupt Enable +#define SSTATUS_SIE (1L << 1) // Supervisor Interrupt Enable +#define SSTATUS_UIE (1L << 0) // User Interrupt Enable + +static inline uint64 +r_sstatus() +{ + uint64 x; + asm volatile("csrr %0, sstatus" : "=r" (x) ); + return x; +} + +static inline void +w_sstatus(uint64 x) +{ + asm volatile("csrw sstatus, %0" : : "r" (x)); +} + +// Supervisor Interrupt Pending +static inline uint64 +r_sip() +{ + uint64 x; + asm volatile("csrr %0, sip" : "=r" (x) ); + return x; +} + +static inline void +w_sip(uint64 x) +{ + asm volatile("csrw sip, %0" : : "r" (x)); +} + +// Supervisor Interrupt Enable +#define SIE_SEIE (1L << 9) // external +#define SIE_STIE (1L << 5) // timer +#define SIE_SSIE (1L << 1) // software +static inline uint64 +r_sie() +{ + uint64 x; + asm volatile("csrr %0, sie" : "=r" (x) ); + return x; +} + +static inline void +w_sie(uint64 x) +{ + asm volatile("csrw sie, %0" : : "r" (x)); +} + +// Machine-mode Interrupt Enable +#define 
MIE_MEIE (1L << 11) // external
+#define MIE_MTIE (1L << 7)  // timer
+#define MIE_MSIE (1L << 3)  // software
+static inline uint64
+r_mie()
+{
+  uint64 x;
+  asm volatile("csrr %0, mie" : "=r" (x) );
+  return x;
+}
+
+static inline void
+w_mie(uint64 x)
+{
+  asm volatile("csrw mie, %0" : : "r" (x));
+}
+
+// supervisor exception program counter, holds the
+// instruction address to which a return from
+// exception will go.
+static inline void
+w_sepc(uint64 x)
+{
+  asm volatile("csrw sepc, %0" : : "r" (x));
+}
+
+static inline uint64
+r_sepc()
+{
+  uint64 x;
+  asm volatile("csrr %0, sepc" : "=r" (x) );
+  return x;
+}
+
+// Machine Exception Delegation
+static inline uint64
+r_medeleg()
+{
+  uint64 x;
+  asm volatile("csrr %0, medeleg" : "=r" (x) );
+  return x;
+}
+
+static inline void
+w_medeleg(uint64 x)
+{
+  asm volatile("csrw medeleg, %0" : : "r" (x));
+}
+
+// Machine Interrupt Delegation
+static inline uint64
+r_mideleg()
+{
+  uint64 x;
+  asm volatile("csrr %0, mideleg" : "=r" (x) );
+  return x;
+}
+
+static inline void
+w_mideleg(uint64 x)
+{
+  asm volatile("csrw mideleg, %0" : : "r" (x));
+}
+
+// Supervisor Trap-Vector Base Address
+// low two bits are mode.
+static inline void
+w_stvec(uint64 x)
+{
+  asm volatile("csrw stvec, %0" : : "r" (x));
+}
+
+static inline uint64
+r_stvec()
+{
+  uint64 x;
+  asm volatile("csrr %0, stvec" : "=r" (x) );
+  return x;
+}
+
+// Machine-mode interrupt vector
+static inline void
+w_mtvec(uint64 x)
+{
+  asm volatile("csrw mtvec, %0" : : "r" (x));
+}
+
+// use riscv's sv39 page table scheme.
+#define SATP_SV39 (8L << 60)
+
+#define MAKE_SATP(pagetable) (SATP_SV39 | (((uint64)pagetable) >> 12))
+
+// supervisor address translation and protection;
+// holds the address of the page table.
+static inline void +w_satp(uint64 x) +{ + asm volatile("csrw satp, %0" : : "r" (x)); +} + +static inline uint64 +r_satp() +{ + uint64 x; + asm volatile("csrr %0, satp" : "=r" (x) ); + return x; +} + +// Supervisor Scratch register, for early trap handler in trampoline.S. +static inline void +w_sscratch(uint64 x) +{ + asm volatile("csrw sscratch, %0" : : "r" (x)); +} + +static inline void +w_mscratch(uint64 x) +{ + asm volatile("csrw mscratch, %0" : : "r" (x)); +} + +// Supervisor Trap Cause +static inline uint64 +r_scause() +{ + uint64 x; + asm volatile("csrr %0, scause" : "=r" (x) ); + return x; +} + +// Supervisor Trap Value +static inline uint64 +r_stval() +{ + uint64 x; + asm volatile("csrr %0, stval" : "=r" (x) ); + return x; +} + +// Machine-mode Counter-Enable +static inline void +w_mcounteren(uint64 x) +{ + asm volatile("csrw mcounteren, %0" : : "r" (x)); +} + +static inline uint64 +r_mcounteren() +{ + uint64 x; + asm volatile("csrr %0, mcounteren" : "=r" (x) ); + return x; +} + +// machine-mode cycle counter +static inline uint64 +r_time() +{ + uint64 x; + asm volatile("csrr %0, time" : "=r" (x) ); + return x; +} + +// enable device interrupts +static inline void +intr_on() +{ + w_sie(r_sie() | SIE_SEIE | SIE_STIE | SIE_SSIE); + w_sstatus(r_sstatus() | SSTATUS_SIE); +} + +// disable device interrupts +static inline void +intr_off() +{ + w_sstatus(r_sstatus() & ~SSTATUS_SIE); +} + +// are device interrupts enabled? +static inline int +intr_get() +{ + uint64 x = r_sstatus(); + return (x & SSTATUS_SIE) != 0; +} + +static inline uint64 +r_sp() +{ + uint64 x; + asm volatile("mv %0, sp" : "=r" (x) ); + return x; +} + +// read and write tp, the thread pointer, which holds +// this core's hartid (core number), the index into cpus[]. 
+static inline uint64 +r_tp() +{ + uint64 x; + asm volatile("mv %0, tp" : "=r" (x) ); + return x; +} + +static inline void +w_tp(uint64 x) +{ + asm volatile("mv tp, %0" : : "r" (x)); +} + +static inline uint64 +r_ra() +{ + uint64 x; + asm volatile("mv %0, ra" : "=r" (x) ); + return x; +} + +// tell the machine to finish any previous writes to +// PTEs, so that a subsequent use of a virtual +// address or load of the SATP will see those writes. +// perhaps this also flushes the TLB. +static inline void +sfence_vma() +{ + // the zero, zero means flush all TLB entries. + asm volatile("sfence.vma zero, zero"); +} + + +#define PGSIZE 4096 // bytes per page +#define PGSHIFT 12 // bits of offset within a page + +#define PGROUNDUP(sz) (((sz)+PGSIZE-1) & ~(PGSIZE-1)) +#define PGROUNDDOWN(a) (((a)) & ~(PGSIZE-1)) + +#define PTE_V (1L << 0) // valid +#define PTE_R (1L << 1) +#define PTE_W (1L << 2) +#define PTE_X (1L << 3) +#define PTE_U (1L << 4) // 1 -> user can access + +// shift a physical address to the right place for a PTE. +#define PA2PTE(pa) ((((uint64)pa) >> 12) << 10) + +#define PTE2PA(pte) (((pte) >> 10) << 12) + +#define PTE_FLAGS(pte) ((pte) & (PTE_V|PTE_R|PTE_W|PTE_X|PTE_U)) + +// extract the three 9-bit page table indices from a virtual address. +#define PXMASK 0x1FF // 9 bits +#define PXSHIFT(level) (PGSHIFT+(9*(level))) +#define PX(level, va) ((((uint64) (va)) >> PXSHIFT(level)) & PXMASK) + +// one beyond the highest possible virtual address. +// MAXVA is actually one bit less than the max allowed by +// Sv39, to avoid having to sign-extend virtual addresses +// that have the high bit set. 
+#define MAXVA (1L << (9 + 9 + 9 + 12 - 1)) + +typedef uint64 pte_t; +typedef uint64 *pagetable_t; // 512 PTEs diff --git a/kernel/sleeplock.c b/kernel/sleeplock.c new file mode 100644 index 0000000..81de585 --- /dev/null +++ b/kernel/sleeplock.c @@ -0,0 +1,55 @@ +// Sleeping locks + +#include "types.h" +#include "riscv.h" +#include "defs.h" +#include "param.h" +#include "memlayout.h" +#include "spinlock.h" +#include "proc.h" +#include "sleeplock.h" + +void +initsleeplock(struct sleeplock *lk, char *name) +{ + initlock(&lk->lk, "sleep lock"); + lk->name = name; + lk->locked = 0; + lk->pid = 0; +} + +void +acquiresleep(struct sleeplock *lk) +{ + acquire(&lk->lk); + while (lk->locked) { + sleep(lk, &lk->lk); + } + lk->locked = 1; + lk->pid = myproc()->pid; + release(&lk->lk); +} + +void +releasesleep(struct sleeplock *lk) +{ + acquire(&lk->lk); + lk->locked = 0; + lk->pid = 0; + wakeup(lk); + release(&lk->lk); +} + +int +holdingsleep(struct sleeplock *lk) +{ + int r; + + acquire(&lk->lk); + r = lk->locked && (lk->pid == myproc()->pid); + release(&lk->lk); + return r; +} + + + diff --git a/kernel/sleeplock.h b/kernel/sleeplock.h new file mode 100644 index 0000000..110e6f3 --- /dev/null +++ b/kernel/sleeplock.h @@ -0,0 +1,10 @@ +// Long-term locks for processes +struct sleeplock { + uint locked; // Is the lock held? + struct spinlock lk; // spinlock protecting this sleep lock + + // For debugging: + char *name; // Name of lock. + int pid; // Process holding lock +}; + diff --git a/kernel/spinlock.c b/kernel/spinlock.c new file mode 100644 index 0000000..563532e --- /dev/null +++ b/kernel/spinlock.c @@ -0,0 +1,108 @@ +// Mutual exclusion spin locks. + +#include "types.h" +#include "param.h" +#include "memlayout.h" +#include "spinlock.h" +#include "riscv.h" +#include "proc.h" +#include "defs.h" + +void +initlock(struct spinlock *lk, char *name) +{ + lk->name = name; + lk->locked = 0; + lk->cpu = 0; +} + +// Acquire the lock. +// Loops (spins) until the lock is acquired. 
+void +acquire(struct spinlock *lk) +{ + push_off(); // disable interrupts to avoid deadlock. + if(holding(lk)) + panic("acquire"); + + // On RISC-V, sync_lock_test_and_set turns into an atomic swap: + // a5 = 1 + // s1 = &lk->locked + // amoswap.w.aq a5, a5, (s1) + while(__sync_lock_test_and_set(&lk->locked, 1) != 0) + ; + + // Tell the C compiler and the processor to not move loads or stores + // past this point, to ensure that the critical section's memory + // references happen after the lock is acquired. + __sync_synchronize(); + + // Record info about lock acquisition for holding() and debugging. + lk->cpu = mycpu(); +} + +// Release the lock. +void +release(struct spinlock *lk) +{ + if(!holding(lk)) + panic("release"); + + lk->cpu = 0; + + // Tell the C compiler and the CPU to not move loads or stores + // past this point, to ensure that all the stores in the critical + // section are visible to other CPUs before the lock is released. + // On RISC-V, this turns into a fence instruction. + __sync_synchronize(); + + // Release the lock, equivalent to lk->locked = 0. + // This code doesn't use a C assignment, since the C standard + // implies that an assignment might be implemented with + // multiple store instructions. + // On RISC-V, sync_lock_release turns into an atomic swap: + // s1 = &lk->locked + // amoswap.w zero, zero, (s1) + __sync_lock_release(&lk->locked); + + pop_off(); +} + +// Check whether this cpu is holding the lock. +int +holding(struct spinlock *lk) +{ + int r; + push_off(); + r = (lk->locked && lk->cpu == mycpu()); + pop_off(); + return r; +} + +// push_off/pop_off are like intr_off()/intr_on() except that they are matched: +// it takes two pop_off()s to undo two push_off()s. Also, if interrupts +// are initially off, then push_off, pop_off leaves them off. 
+ +void +push_off(void) +{ + int old = intr_get(); + + intr_off(); + if(mycpu()->noff == 0) + mycpu()->intena = old; + mycpu()->noff += 1; +} + +void +pop_off(void) +{ + struct cpu *c = mycpu(); + if(intr_get()) + panic("pop_off - interruptible"); + c->noff -= 1; + if(c->noff < 0) + panic("pop_off"); + if(c->noff == 0 && c->intena) + intr_on(); +} diff --git a/kernel/spinlock.h b/kernel/spinlock.h new file mode 100644 index 0000000..4392820 --- /dev/null +++ b/kernel/spinlock.h @@ -0,0 +1,9 @@ +// Mutual exclusion lock. +struct spinlock { + uint locked; // Is the lock held? + + // For debugging: + char *name; // Name of lock. + struct cpu *cpu; // The cpu holding the lock. +}; + diff --git a/kernel/start.c b/kernel/start.c new file mode 100644 index 0000000..203c5e6 --- /dev/null +++ b/kernel/start.c @@ -0,0 +1,82 @@ +#include "types.h" +#include "param.h" +#include "memlayout.h" +#include "riscv.h" +#include "defs.h" + +void main(); +void timerinit(); + +// entry.S needs one stack per CPU. +__attribute__ ((aligned (16))) char stack0[4096 * NCPU]; + +// scratch area for timer interrupt, one per CPU. +uint64 mscratch0[NCPU * 32]; + +// assembly code in kernelvec.S for machine-mode timer interrupt. +extern void timervec(); + +// entry.S jumps here in machine mode on stack0. +void +start() +{ + // set M Previous Privilege mode to Supervisor, for mret. + unsigned long x = r_mstatus(); + x &= ~MSTATUS_MPP_MASK; + x |= MSTATUS_MPP_S; + w_mstatus(x); + + // set M Exception Program Counter to main, for mret. + // requires gcc -mcmodel=medany + w_mepc((uint64)main); + + // disable paging for now. + w_satp(0); + + // delegate all interrupts and exceptions to supervisor mode. + w_medeleg(0xffff); + w_mideleg(0xffff); + + // ask for clock interrupts. + timerinit(); + + // keep each CPU's hartid in its tp register, for cpuid(). + int id = r_mhartid(); + w_tp(id); + + // switch to supervisor mode and jump to main(). 
+ asm volatile("mret"); +} + +// set up to receive timer interrupts in machine mode, +// which arrive at timervec in kernelvec.S, +// which turns them into software interrupts for +// devintr() in trap.c. +void +timerinit() +{ + // each CPU has a separate source of timer interrupts. + int id = r_mhartid(); + + // ask the CLINT for a timer interrupt. + int interval = 1000000; // cycles; about 1/10th second in qemu. + *(uint64*)CLINT_MTIMECMP(id) = *(uint64*)CLINT_MTIME + interval; + + // prepare information in scratch[] for timervec. + // scratch[0..3] : space for timervec to save registers. + // scratch[4] : address of CLINT MTIMECMP register. + // scratch[5] : desired interval (in cycles) between timer interrupts. + uint64 *scratch = &mscratch0[32 * id]; + scratch[4] = CLINT_MTIMECMP(id); + scratch[5] = interval; + w_mscratch((uint64)scratch); + + // set the machine-mode trap handler. + w_mtvec((uint64)timervec); + + // enable machine-mode interrupts. + w_mstatus(r_mstatus() | MSTATUS_MIE); + + // enable machine-mode timer interrupts. 
+ w_mie(r_mie() | MIE_MTIE); +} diff --git a/kernel/stat.h b/kernel/stat.h new file mode 100644 index 0000000..19543af --- /dev/null +++ b/kernel/stat.h @@ -0,0 +1,11 @@ +#define T_DIR 1 // Directory +#define T_FILE 2 // File +#define T_DEVICE 3 // Device + +struct stat { + int dev; // File system's disk device + uint ino; // Inode number + short type; // Type of file + short nlink; // Number of links to file + uint64 size; // Size of file in bytes +}; diff --git a/kernel/string.c b/kernel/string.c new file mode 100644 index 0000000..d99e612 --- /dev/null +++ b/kernel/string.c @@ -0,0 +1,104 @@ +#include "types.h" + +void* +memset(void *dst, int c, uint n) +{ + char *cdst = (char *) dst; + int i; + for(i = 0; i < n; i++){ + cdst[i] = c; + } + return dst; +} + +int +memcmp(const void *v1, const void *v2, uint n) +{ + const uchar *s1, *s2; + + s1 = v1; + s2 = v2; + while(n-- > 0){ + if(*s1 != *s2) + return *s1 - *s2; + s1++, s2++; + } + + return 0; +} + +void* +memmove(void *dst, const void *src, uint n) +{ + const char *s; + char *d; + + s = src; + d = dst; + if(s < d && s + n > d){ + s += n; + d += n; + while(n-- > 0) + *--d = *--s; + } else + while(n-- > 0) + *d++ = *s++; + + return dst; +} + +// memcpy exists to placate GCC. Use memmove. +void* +memcpy(void *dst, const void *src, uint n) +{ + return memmove(dst, src, n); +} + +int +strncmp(const char *p, const char *q, uint n) +{ + while(n > 0 && *p && *p == *q) + n--, p++, q++; + if(n == 0) + return 0; + return (uchar)*p - (uchar)*q; +} + +char* +strncpy(char *s, const char *t, int n) +{ + char *os; + + os = s; + while(n-- > 0 && (*s++ = *t++) != 0) + ; + while(n-- > 0) + *s++ = 0; + return os; +} + +// Like strncpy but guaranteed to NUL-terminate. 
+char*
+safestrcpy(char *s, const char *t, int n)
+{
+  char *os;
+
+  os = s;
+  if(n <= 0)
+    return os;
+  while(--n > 0 && (*s++ = *t++) != 0)
+    ;
+  *s = 0;
+  return os;
+}
+
+int
+strlen(const char *s)
+{
+  int n;
+
+  for(n = 0; s[n]; n++)
+    ;
+  return n;
+}
+
diff --git a/kernel/swtch.S b/kernel/swtch.S
new file mode 100644
index 0000000..17a8663
--- /dev/null
+++ b/kernel/swtch.S
@@ -0,0 +1,42 @@
+# Context switch
+#
+#   void swtch(struct context *old, struct context *new);
+#
+# Save current registers in old. Load from new.
+
+
+.globl swtch
+swtch:
+        sd ra, 0(a0)
+        sd sp, 8(a0)
+        sd s0, 16(a0)
+        sd s1, 24(a0)
+        sd s2, 32(a0)
+        sd s3, 40(a0)
+        sd s4, 48(a0)
+        sd s5, 56(a0)
+        sd s6, 64(a0)
+        sd s7, 72(a0)
+        sd s8, 80(a0)
+        sd s9, 88(a0)
+        sd s10, 96(a0)
+        sd s11, 104(a0)
+
+        ld ra, 0(a1)
+        ld sp, 8(a1)
+        ld s0, 16(a1)
+        ld s1, 24(a1)
+        ld s2, 32(a1)
+        ld s3, 40(a1)
+        ld s4, 48(a1)
+        ld s5, 56(a1)
+        ld s6, 64(a1)
+        ld s7, 72(a1)
+        ld s8, 80(a1)
+        ld s9, 88(a1)
+        ld s10, 96(a1)
+        ld s11, 104(a1)
+
+        ret
+
+
diff --git a/kernel/syscall.c b/kernel/syscall.c
new file mode 100644
index 0000000..97974d6
--- /dev/null
+++ b/kernel/syscall.c
@@ -0,0 +1,147 @@
+#include "types.h"
+#include "param.h"
+#include "memlayout.h"
+#include "riscv.h"
+#include "spinlock.h"
+#include "proc.h"
+#include "syscall.h"
+#include "defs.h"
+
+// Fetch the uint64 at addr from the current process.
+int
+fetchaddr(uint64 addr, uint64 *ip)
+{
+  struct proc *p = myproc();
+  if(addr >= p->sz || addr+sizeof(uint64) > p->sz)
+    return -1;
+  if(copyin(p->pagetable, (char *)ip, addr, sizeof(*ip)) != 0)
+    return -1;
+  return 0;
+}
+
+// Fetch the nul-terminated string at addr from the current process.
+// Copies the string into buf, at most max bytes.
+// Returns length of string, not including nul, or -1 for error.
+int +fetchstr(uint64 addr, char *buf, int max) +{ + struct proc *p = myproc(); + int err = copyinstr(p->pagetable, buf, addr, max); + if(err < 0) + return err; + return strlen(buf); +} + +static uint64 +argraw(int n) +{ + struct proc *p = myproc(); + switch (n) { + case 0: + return p->tf->a0; + case 1: + return p->tf->a1; + case 2: + return p->tf->a2; + case 3: + return p->tf->a3; + case 4: + return p->tf->a4; + case 5: + return p->tf->a5; + } + panic("argraw"); + return -1; +} + +// Fetch the nth 32-bit system call argument. +int +argint(int n, int *ip) +{ + *ip = argraw(n); + return 0; +} + +// Retrieve an argument as a pointer. +// Doesn't check for legality, since +// copyin/copyout will do that. +int +argaddr(int n, uint64 *ip) +{ + *ip = argraw(n); + return 0; +} + +// Fetch the nth word-sized system call argument as a null-terminated string. +// Copies into buf, at most max. +// Returns string length if OK (including nul), -1 if error. +int +argstr(int n, char *buf, int max) +{ + uint64 addr; + if(argaddr(n, &addr) < 0) + return -1; + return fetchstr(addr, buf, max); +} + +extern uint64 sys_chdir(void); +extern uint64 sys_close(void); +extern uint64 sys_dup(void); +extern uint64 sys_exec(void); +extern uint64 sys_exit(void); +extern uint64 sys_fork(void); +extern uint64 sys_fstat(void); +extern uint64 sys_getpid(void); +extern uint64 sys_kill(void); +extern uint64 sys_link(void); +extern uint64 sys_mkdir(void); +extern uint64 sys_mknod(void); +extern uint64 sys_open(void); +extern uint64 sys_pipe(void); +extern uint64 sys_read(void); +extern uint64 sys_sbrk(void); +extern uint64 sys_sleep(void); +extern uint64 sys_unlink(void); +extern uint64 sys_wait(void); +extern uint64 sys_write(void); +extern uint64 sys_uptime(void); + +static uint64 (*syscalls[])(void) = { +[SYS_fork] sys_fork, +[SYS_exit] sys_exit, +[SYS_wait] sys_wait, +[SYS_pipe] sys_pipe, +[SYS_read] sys_read, +[SYS_kill] sys_kill, +[SYS_exec] sys_exec, +[SYS_fstat] sys_fstat, +[SYS_chdir] 
sys_chdir, +[SYS_dup] sys_dup, +[SYS_getpid] sys_getpid, +[SYS_sbrk] sys_sbrk, +[SYS_sleep] sys_sleep, +[SYS_uptime] sys_uptime, +[SYS_open] sys_open, +[SYS_write] sys_write, +[SYS_mknod] sys_mknod, +[SYS_unlink] sys_unlink, +[SYS_link] sys_link, +[SYS_mkdir] sys_mkdir, +[SYS_close] sys_close, +}; + +void +syscall(void) +{ + int num; + struct proc *p = myproc(); + + num = p->tf->a7; + if(num > 0 && num < NELEM(syscalls) && syscalls[num]) { + p->tf->a0 = syscalls[num](); + } else { + printf("%d %s: unknown sys call %d\n", + p->pid, p->name, num); + p->tf->a0 = -1; + } +} diff --git a/kernel/syscall.h b/kernel/syscall.h new file mode 100644 index 0000000..bc5f356 --- /dev/null +++ b/kernel/syscall.h @@ -0,0 +1,22 @@ +// System call numbers +#define SYS_fork 1 +#define SYS_exit 2 +#define SYS_wait 3 +#define SYS_pipe 4 +#define SYS_read 5 +#define SYS_kill 6 +#define SYS_exec 7 +#define SYS_fstat 8 +#define SYS_chdir 9 +#define SYS_dup 10 +#define SYS_getpid 11 +#define SYS_sbrk 12 +#define SYS_sleep 13 +#define SYS_uptime 14 +#define SYS_open 15 +#define SYS_write 16 +#define SYS_mknod 17 +#define SYS_unlink 18 +#define SYS_link 19 +#define SYS_mkdir 20 +#define SYS_close 21 diff --git a/kernel/sysfile.c b/kernel/sysfile.c new file mode 100644 index 0000000..23a9540 --- /dev/null +++ b/kernel/sysfile.c @@ -0,0 +1,476 @@ +// +// File-system system calls. +// Mostly argument checking, since we don't trust +// user code, and calls into file.c and fs.c. +// + +#include "types.h" +#include "riscv.h" +#include "defs.h" +#include "param.h" +#include "stat.h" +#include "spinlock.h" +#include "proc.h" +#include "fs.h" +#include "sleeplock.h" +#include "file.h" +#include "fcntl.h" + +// Fetch the nth word-sized system call argument as a file descriptor +// and return both the descriptor and the corresponding struct file. 
+static int +argfd(int n, int *pfd, struct file **pf) +{ + int fd; + struct file *f; + + if(argint(n, &fd) < 0) + return -1; + if(fd < 0 || fd >= NOFILE || (f=myproc()->ofile[fd]) == 0) + return -1; + if(pfd) + *pfd = fd; + if(pf) + *pf = f; + return 0; +} + +// Allocate a file descriptor for the given file. +// Takes over file reference from caller on success. +static int +fdalloc(struct file *f) +{ + int fd; + struct proc *p = myproc(); + + for(fd = 0; fd < NOFILE; fd++){ + if(p->ofile[fd] == 0){ + p->ofile[fd] = f; + return fd; + } + } + return -1; +} + +uint64 +sys_dup(void) +{ + struct file *f; + int fd; + + if(argfd(0, 0, &f) < 0) + return -1; + if((fd=fdalloc(f)) < 0) + return -1; + filedup(f); + return fd; +} + +uint64 +sys_read(void) +{ + struct file *f; + int n; + uint64 p; + + if(argfd(0, 0, &f) < 0 || argint(2, &n) < 0 || argaddr(1, &p) < 0) + return -1; + return fileread(f, p, n); +} + +uint64 +sys_write(void) +{ + struct file *f; + int n; + uint64 p; + + if(argfd(0, 0, &f) < 0 || argint(2, &n) < 0 || argaddr(1, &p) < 0) + return -1; + + return filewrite(f, p, n); +} + +uint64 +sys_close(void) +{ + int fd; + struct file *f; + + if(argfd(0, &fd, &f) < 0) + return -1; + myproc()->ofile[fd] = 0; + fileclose(f); + return 0; +} + +uint64 +sys_fstat(void) +{ + struct file *f; + uint64 st; // user pointer to struct stat + + if(argfd(0, 0, &f) < 0 || argaddr(1, &st) < 0) + return -1; + return filestat(f, st); +} + +// Create the path new as a link to the same inode as old. 
+uint64 +sys_link(void) +{ + char name[DIRSIZ], new[MAXPATH], old[MAXPATH]; + struct inode *dp, *ip; + + if(argstr(0, old, MAXPATH) < 0 || argstr(1, new, MAXPATH) < 0) + return -1; + + begin_op(); + if((ip = namei(old)) == 0){ + end_op(); + return -1; + } + + ilock(ip); + if(ip->type == T_DIR){ + iunlockput(ip); + end_op(); + return -1; + } + + ip->nlink++; + iupdate(ip); + iunlock(ip); + + if((dp = nameiparent(new, name)) == 0) + goto bad; + ilock(dp); + if(dp->dev != ip->dev || dirlink(dp, name, ip->inum) < 0){ + iunlockput(dp); + goto bad; + } + iunlockput(dp); + iput(ip); + + end_op(); + + return 0; + +bad: + ilock(ip); + ip->nlink--; + iupdate(ip); + iunlockput(ip); + end_op(); + return -1; +} + +// Is the directory dp empty except for "." and ".." ? +static int +isdirempty(struct inode *dp) +{ + int off; + struct dirent de; + + for(off=2*sizeof(de); off<dp->size; off+=sizeof(de)){ + if(readi(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de)) + panic("isdirempty: readi"); + if(de.inum != 0) + return 0; + } + return 1; +} + +uint64 +sys_unlink(void) +{ + struct inode *ip, *dp; + struct dirent de; + char name[DIRSIZ], path[MAXPATH]; + uint off; + + if(argstr(0, path, MAXPATH) < 0) + return -1; + + begin_op(); + if((dp = nameiparent(path, name)) == 0){ + end_op(); + return -1; + } + + ilock(dp); + + // Cannot unlink "." or "..". 
+ if(namecmp(name, ".") == 0 || namecmp(name, "..") == 0) + goto bad; + + if((ip = dirlookup(dp, name, &off)) == 0) + goto bad; + ilock(ip); + + if(ip->nlink < 1) + panic("unlink: nlink < 1"); + if(ip->type == T_DIR && !isdirempty(ip)){ + iunlockput(ip); + goto bad; + } + + memset(&de, 0, sizeof(de)); + if(writei(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de)) + panic("unlink: writei"); + if(ip->type == T_DIR){ + dp->nlink--; + iupdate(dp); + } + iunlockput(dp); + + ip->nlink--; + iupdate(ip); + iunlockput(ip); + + end_op(); + + return 0; + +bad: + iunlockput(dp); + end_op(); + return -1; +} + +static struct inode* +create(char *path, short type, short major, short minor) +{ + struct inode *ip, *dp; + char name[DIRSIZ]; + + if((dp = nameiparent(path, name)) == 0) + return 0; + ilock(dp); + + if((ip = dirlookup(dp, name, 0)) != 0){ + iunlockput(dp); + ilock(ip); + if(type == T_FILE && (ip->type == T_FILE || ip->type == T_DEVICE)) + return ip; + iunlockput(ip); + return 0; + } + + if((ip = ialloc(dp->dev, type)) == 0) + panic("create: ialloc"); + + ilock(ip); + ip->major = major; + ip->minor = minor; + ip->nlink = 1; + iupdate(ip); + + if(type == T_DIR){ // Create . and .. entries. + dp->nlink++; // for ".." + iupdate(dp); + // No ip->nlink++ for ".": avoid cyclic ref count. 
+ if(dirlink(ip, ".", ip->inum) < 0 || dirlink(ip, "..", dp->inum) < 0) + panic("create dots"); + } + + if(dirlink(dp, name, ip->inum) < 0) + panic("create: dirlink"); + + iunlockput(dp); + + return ip; +} + +uint64 +sys_open(void) +{ + char path[MAXPATH]; + int fd, omode; + struct file *f; + struct inode *ip; + + if(argstr(0, path, MAXPATH) < 0 || argint(1, &omode) < 0) + return -1; + + begin_op(); + + if(omode & O_CREATE){ + ip = create(path, T_FILE, 0, 0); + if(ip == 0){ + end_op(); + return -1; + } + } else { + if((ip = namei(path)) == 0){ + end_op(); + return -1; + } + ilock(ip); + if(ip->type == T_DIR && omode != O_RDONLY){ + iunlockput(ip); + end_op(); + return -1; + } + } + + if(ip->type == T_DEVICE && (ip->major < 0 || ip->major >= NDEV)){ + iunlockput(ip); + end_op(); + return -1; + } + + if((f = filealloc()) == 0 || (fd = fdalloc(f)) < 0){ + if(f) + fileclose(f); + iunlockput(ip); + end_op(); + return -1; + } + + if(ip->type == T_DEVICE){ + f->type = FD_DEVICE; + f->major = ip->major; + } else { + f->type = FD_INODE; + f->off = 0; + } + f->ip = ip; + f->readable = !(omode & O_WRONLY); + f->writable = (omode & O_WRONLY) || (omode & O_RDWR); + + iunlock(ip); + end_op(); + + return fd; +} + +uint64 +sys_mkdir(void) +{ + char path[MAXPATH]; + struct inode *ip; + + begin_op(); + if(argstr(0, path, MAXPATH) < 0 || (ip = create(path, T_DIR, 0, 0)) == 0){ + end_op(); + return -1; + } + iunlockput(ip); + end_op(); + return 0; +} + +uint64 +sys_mknod(void) +{ + struct inode *ip; + char path[MAXPATH]; + int major, minor; + + begin_op(); + if((argstr(0, path, MAXPATH)) < 0 || + argint(1, &major) < 0 || + argint(2, &minor) < 0 || + (ip = create(path, T_DEVICE, major, minor)) == 0){ + end_op(); + return -1; + } + iunlockput(ip); + end_op(); + return 0; +} + +uint64 +sys_chdir(void) +{ + char path[MAXPATH]; + struct inode *ip; + struct proc *p = myproc(); + + begin_op(); + if(argstr(0, path, MAXPATH) < 0 || (ip = namei(path)) == 0){ + end_op(); + return -1; + } + 
ilock(ip); + if(ip->type != T_DIR){ + iunlockput(ip); + end_op(); + return -1; + } + iunlock(ip); + iput(p->cwd); + end_op(); + p->cwd = ip; + return 0; +} + +uint64 +sys_exec(void) +{ + char path[MAXPATH], *argv[MAXARG]; + int i; + uint64 uargv, uarg; + + if(argstr(0, path, MAXPATH) < 0 || argaddr(1, &uargv) < 0){ + return -1; + } + memset(argv, 0, sizeof(argv)); + for(i=0;; i++){ + if(i >= NELEM(argv)){ + return -1; + } + if(fetchaddr(uargv+sizeof(uint64)*i, (uint64*)&uarg) < 0){ + return -1; + } + if(uarg == 0){ + argv[i] = 0; + break; + } + argv[i] = kalloc(); + if(argv[i] == 0) + panic("sys_exec kalloc"); + if(fetchstr(uarg, argv[i], PGSIZE) < 0){ + return -1; + } + } + + int ret = exec(path, argv); + + for(i = 0; i < NELEM(argv) && argv[i] != 0; i++) + kfree(argv[i]); + + return ret; +} + +uint64 +sys_pipe(void) +{ + uint64 fdarray; // user pointer to array of two integers + struct file *rf, *wf; + int fd0, fd1; + struct proc *p = myproc(); + + if(argaddr(0, &fdarray) < 0) + return -1; + if(pipealloc(&rf, &wf) < 0) + return -1; + fd0 = -1; + if((fd0 = fdalloc(rf)) < 0 || (fd1 = fdalloc(wf)) < 0){ + if(fd0 >= 0) + p->ofile[fd0] = 0; + fileclose(rf); + fileclose(wf); + return -1; + } + if(copyout(p->pagetable, fdarray, (char*)&fd0, sizeof(fd0)) < 0 || + copyout(p->pagetable, fdarray+sizeof(fd0), (char *)&fd1, sizeof(fd1)) < 0){ + p->ofile[fd0] = 0; + p->ofile[fd1] = 0; + fileclose(rf); + fileclose(wf); + return -1; + } + return 0; +} diff --git a/kernel/sysproc.c b/kernel/sysproc.c new file mode 100644 index 0000000..face81a --- /dev/null +++ b/kernel/sysproc.c @@ -0,0 +1,91 @@ +#include "types.h" +#include "riscv.h" +#include "defs.h" +#include "date.h" +#include "param.h" +#include "memlayout.h" +#include "spinlock.h" +#include "proc.h" + +uint64 +sys_exit(void) +{ + exit(); + return 0; // not reached +} + +uint64 +sys_getpid(void) +{ + return myproc()->pid; +} + +uint64 +sys_fork(void) +{ + return fork(); +} + +uint64 +sys_wait(void) +{ + return wait(); +} + 
+uint64 +sys_sbrk(void) +{ + int addr; + int n; + + if(argint(0, &n) < 0) + return -1; + addr = myproc()->sz; + if(growproc(n) < 0) + return -1; + return addr; +} + +uint64 +sys_sleep(void) +{ + int n; + uint ticks0; + + if(argint(0, &n) < 0) + return -1; + acquire(&tickslock); + ticks0 = ticks; + while(ticks - ticks0 < n){ + if(myproc()->killed){ + release(&tickslock); + return -1; + } + sleep(&ticks, &tickslock); + } + release(&tickslock); + return 0; +} + +uint64 +sys_kill(void) +{ + int pid; + + if(argint(0, &pid) < 0) + return -1; + return kill(pid); +} + +// return how many clock tick interrupts have occurred +// since start. +uint64 +sys_uptime(void) +{ + uint xticks; + + acquire(&tickslock); + xticks = ticks; + release(&tickslock); + return xticks; +} diff --git a/kernel/trampoline.S b/kernel/trampoline.S new file mode 100644 index 0000000..24499d9 --- /dev/null +++ b/kernel/trampoline.S @@ -0,0 +1,141 @@ + # + # code to switch between user and kernel space. + # + # this code is mapped at the same virtual address + # (TRAMPOLINE) in user and kernel space so that + # it continues to work when it switches page tables. + # + # kernel.ld causes this to be aligned + # to a page boundary. + # + .section trampsec +.globl trampoline +trampoline: +.align 4 +.globl uservec +uservec: + # + # trap.c sets stvec to point here, so + # traps from user space start here, + # in supervisor mode, but with a + # user page table. + # + # sscratch points to where the process's p->tf is + # mapped into user space, at TRAPFRAME. 
+ #
+
+ # swap a0 and sscratch
+ # so that a0 is TRAPFRAME
+ csrrw a0, sscratch, a0
+
+ # save the user registers in TRAPFRAME
+ sd ra, 40(a0)
+ sd sp, 48(a0)
+ sd gp, 56(a0)
+ sd tp, 64(a0)
+ sd t0, 72(a0)
+ sd t1, 80(a0)
+ sd t2, 88(a0)
+ sd s0, 96(a0)
+ sd s1, 104(a0)
+ sd a1, 120(a0)
+ sd a2, 128(a0)
+ sd a3, 136(a0)
+ sd a4, 144(a0)
+ sd a5, 152(a0)
+ sd a6, 160(a0)
+ sd a7, 168(a0)
+ sd s2, 176(a0)
+ sd s3, 184(a0)
+ sd s4, 192(a0)
+ sd s5, 200(a0)
+ sd s6, 208(a0)
+ sd s7, 216(a0)
+ sd s8, 224(a0)
+ sd s9, 232(a0)
+ sd s10, 240(a0)
+ sd s11, 248(a0)
+ sd t3, 256(a0)
+ sd t4, 264(a0)
+ sd t5, 272(a0)
+ sd t6, 280(a0)
+
+ # save the user a0 in p->tf->a0
+ csrr t0, sscratch
+ sd t0, 112(a0)
+
+ # restore kernel stack pointer from p->tf->kernel_sp
+ ld sp, 8(a0)
+
+ # make tp hold the current hartid, from p->tf->kernel_hartid
+ ld tp, 32(a0)
+
+ # remember the address of usertrap(), p->tf->kernel_trap
+ ld t0, 16(a0)
+
+ # restore kernel page table from p->tf->kernel_satp
+ ld t1, 0(a0)
+ sfence.vma zero, zero
+ csrw satp, t1
+
+ # a0 is no longer valid, since the kernel page
+ # table does not specially map p->tf.
+
+ # jump to usertrap(), which does not return
+ jr t0
+
+.globl userret
+userret:
+ # userret(TRAPFRAME, pagetable)
+ # switch from kernel to user.
+ # usertrapret() calls here.
+ # a0: TRAPFRAME, in user page table
+ # a1: user page table, for satp
+
+ # switch to the user page table.
+ sfence.vma zero, zero
+ csrw satp, a1
+
+ # put the saved user a0 in sscratch, so we
+ # can swap it with our a0 (TRAPFRAME) in the last step.
+ ld t0, 112(a0) + csrw sscratch, t0 + + # restore all but a0 from TRAPFRAME + ld ra, 40(a0) + ld sp, 48(a0) + ld gp, 56(a0) + ld tp, 64(a0) + ld t0, 72(a0) + ld t1, 80(a0) + ld t2, 88(a0) + ld s0, 96(a0) + ld s1, 104(a0) + ld a1, 120(a0) + ld a2, 128(a0) + ld a3, 136(a0) + ld a4, 144(a0) + ld a5, 152(a0) + ld a6, 160(a0) + ld a7, 168(a0) + ld s2, 176(a0) + ld s3, 184(a0) + ld s4, 192(a0) + ld s5, 200(a0) + ld s6, 208(a0) + ld s7, 216(a0) + ld s8, 224(a0) + ld s9, 232(a0) + ld s10, 240(a0) + ld s11, 248(a0) + ld t3, 256(a0) + ld t4, 264(a0) + ld t5, 272(a0) + ld t6, 280(a0) + + # restore user a0, and save TRAPFRAME in sscratch + csrrw a0, sscratch, a0 + + # return to user mode and user pc. + # usertrapret() set up sstatus and sepc. + sret diff --git a/kernel/trap.c b/kernel/trap.c new file mode 100644 index 0000000..ec57bed --- /dev/null +++ b/kernel/trap.c @@ -0,0 +1,213 @@ +#include "types.h" +#include "param.h" +#include "memlayout.h" +#include "riscv.h" +#include "spinlock.h" +#include "proc.h" +#include "defs.h" + +struct spinlock tickslock; +uint ticks; + +extern char trampoline[], uservec[], userret[]; + +// in kernelvec.S, calls kerneltrap(). +void kernelvec(); + +extern int devintr(); + +void +trapinit(void) +{ + initlock(&tickslock, "time"); +} + +// set up to take exceptions and traps while in the kernel. +void +trapinithart(void) +{ + w_stvec((uint64)kernelvec); +} + +// +// handle an interrupt, exception, or system call from user space. +// called from trampoline.S +// +void +usertrap(void) +{ + int which_dev = 0; + + if((r_sstatus() & SSTATUS_SPP) != 0) + panic("usertrap: not from user mode"); + + // send interrupts and exceptions to kerneltrap(), + // since we're now in the kernel. + w_stvec((uint64)kernelvec); + + struct proc *p = myproc(); + + // save user program counter. 
+ p->tf->epc = r_sepc(); + + if(r_scause() == 8){ + // system call + + if(p->killed) + exit(); + + // sepc points to the ecall instruction, + // but we want to return to the next instruction. + p->tf->epc += 4; + + // an interrupt will change sstatus &c registers, + // so don't enable until done with those registers. + intr_on(); + + syscall(); + } else if((which_dev = devintr()) != 0){ + // ok + } else { + printf("usertrap(): unexpected scause %p pid=%d\n", r_scause(), p->pid); + printf(" sepc=%p stval=%p\n", r_sepc(), r_stval()); + p->killed = 1; + } + + if(p->killed) + exit(); + + // give up the CPU if this is a timer interrupt. + if(which_dev == 2) + yield(); + + usertrapret(); +} + +// +// return to user space +// +void +usertrapret(void) +{ + struct proc *p = myproc(); + + // turn off interrupts, since we're switching + // now from kerneltrap() to usertrap(). + intr_off(); + + // send interrupts and exceptions to trampoline.S + w_stvec(TRAMPOLINE + (uservec - trampoline)); + + // set up values that uservec will need when + // the process next re-enters the kernel. + p->tf->kernel_satp = r_satp(); // kernel page table + p->tf->kernel_sp = p->kstack + PGSIZE; // process's kernel stack + p->tf->kernel_trap = (uint64)usertrap; + p->tf->kernel_hartid = r_tp(); // hartid for cpuid() + + // set up the registers that trampoline.S's sret will use + // to get to user space. + + // set S Previous Privilege mode to User. + unsigned long x = r_sstatus(); + x &= ~SSTATUS_SPP; // clear SPP to 0 for user mode + x |= SSTATUS_SPIE; // enable interrupts in user mode + w_sstatus(x); + + // set S Exception Program Counter to the saved user pc. + w_sepc(p->tf->epc); + + // tell trampoline.S the user page table to switch to. + uint64 satp = MAKE_SATP(p->pagetable); + + // jump to trampoline.S at the top of memory, which + // switches to the user page table, restores user registers, + // and switches to user mode with sret. 
+ uint64 fn = TRAMPOLINE + (userret - trampoline); + ((void (*)(uint64,uint64))fn)(TRAPFRAME, satp); +} + +// interrupts and exceptions from kernel code go here via kernelvec, +// on whatever the current kernel stack is. +// must be 4-byte aligned to fit in stvec. +void +kerneltrap() +{ + int which_dev = 0; + uint64 sepc = r_sepc(); + uint64 sstatus = r_sstatus(); + uint64 scause = r_scause(); + + if((sstatus & SSTATUS_SPP) == 0) + panic("kerneltrap: not from supervisor mode"); + if(intr_get() != 0) + panic("kerneltrap: interrupts enabled"); + + if((which_dev = devintr()) == 0){ + printf("scause %p\n", scause); + printf("sepc=%p stval=%p\n", r_sepc(), r_stval()); + panic("kerneltrap"); + } + + // give up the CPU if this is a timer interrupt. + if(which_dev == 2 && myproc() != 0 && myproc()->state == RUNNING) + yield(); + + // the yield() may have caused some traps to occur, + // so restore trap registers for use by kernelvec.S's sepc instruction. + w_sepc(sepc); + w_sstatus(sstatus); +} + +void +clockintr() +{ + acquire(&tickslock); + ticks++; + wakeup(&ticks); + release(&tickslock); +} + +// check if it's an external interrupt or software interrupt, +// and handle it. +// returns 2 if timer interrupt, +// 1 if other device, +// 0 if not recognized. +int +devintr() +{ + uint64 scause = r_scause(); + + if((scause & 0x8000000000000000L) && + (scause & 0xff) == 9){ + // this is a supervisor external interrupt, via PLIC. + + // irq indicates which device interrupted. + int irq = plic_claim(); + + if(irq == UART0_IRQ){ + uartintr(); + } else if(irq == VIRTIO0_IRQ){ + virtio_disk_intr(); + } + + plic_complete(irq); + return 1; + } else if(scause == 0x8000000000000001L){ + // software interrupt from a machine-mode timer interrupt, + // forwarded by timervec in kernelvec.S. + + if(cpuid() == 0){ + clockintr(); + } + + // acknowledge the software interrupt by clearing + // the SSIP bit in sip. 
+ w_sip(r_sip() & ~2); + + return 2; + } else { + return 0; + } +} + diff --git a/kernel/types.h b/kernel/types.h new file mode 100644 index 0000000..ee73164 --- /dev/null +++ b/kernel/types.h @@ -0,0 +1,10 @@ +typedef unsigned int uint; +typedef unsigned short ushort; +typedef unsigned char uchar; + +typedef unsigned char uint8; +typedef unsigned short uint16; +typedef unsigned int uint32; +typedef unsigned long uint64; + +typedef uint64 pde_t; diff --git a/kernel/uart.c b/kernel/uart.c new file mode 100644 index 0000000..3a5cdc4 --- /dev/null +++ b/kernel/uart.c @@ -0,0 +1,92 @@ +// +// low-level driver routines for 16550a UART. +// + +#include "types.h" +#include "param.h" +#include "memlayout.h" +#include "riscv.h" +#include "spinlock.h" +#include "proc.h" +#include "defs.h" + +// the UART control registers are memory-mapped +// at address UART0. this macro returns the +// address of one of the registers. +#define Reg(reg) ((volatile unsigned char *)(UART0 + reg)) + +// the UART control registers. +// some have different meanings for +// read vs write. +// http://byterunner.com/16550.html +#define RHR 0 // receive holding register (for input bytes) +#define THR 0 // transmit holding register (for output bytes) +#define IER 1 // interrupt enable register +#define FCR 2 // FIFO control register +#define ISR 2 // interrupt status register +#define LCR 3 // line control register +#define LSR 5 // line status register + +#define ReadReg(reg) (*(Reg(reg))) +#define WriteReg(reg, v) (*(Reg(reg)) = (v)) + +void +uartinit(void) +{ + // disable interrupts. + WriteReg(IER, 0x00); + + // special mode to set baud rate. + WriteReg(LCR, 0x80); + + // LSB for baud rate of 38.4K. + WriteReg(0, 0x03); + + // MSB for baud rate of 38.4K. + WriteReg(1, 0x00); + + // leave set-baud mode, + // and set word length to 8 bits, no parity. + WriteReg(LCR, 0x03); + + // reset and enable FIFOs. + WriteReg(FCR, 0x07); + + // enable receive interrupts. 
+ WriteReg(IER, 0x01); +} + +// write one output character to the UART. +void +uartputc(int c) +{ + // wait for Transmit Holding Empty to be set in LSR. + while((ReadReg(LSR) & (1 << 5)) == 0) + ; + WriteReg(THR, c); +} + +// read one input character from the UART. +// return -1 if none is waiting. +int +uartgetc(void) +{ + if(ReadReg(LSR) & 0x01){ + // input data is ready. + return ReadReg(RHR); + } else { + return -1; + } +} + +// trap.c calls here when the uart interrupts. +void +uartintr(void) +{ + while(1){ + int c = uartgetc(); + if(c == -1) + break; + consoleintr(c); + } +} diff --git a/kernel/virtio.h b/kernel/virtio.h new file mode 100644 index 0000000..03b53a9 --- /dev/null +++ b/kernel/virtio.h @@ -0,0 +1,72 @@ +// +// virtio device definitions. +// for both the mmio interface, and virtio descriptors. +// only tested with qemu. +// this is the "legacy" virtio interface. +// +// the virtio spec: +// https://docs.oasis-open.org/virtio/virtio/v1.1/virtio-v1.1.pdf +// + +// virtio mmio control registers, mapped starting at 0x10001000. 
+// from qemu virtio_mmio.h +#define VIRTIO_MMIO_MAGIC_VALUE 0x000 // 0x74726976 +#define VIRTIO_MMIO_VERSION 0x004 // version; 1 is legacy +#define VIRTIO_MMIO_DEVICE_ID 0x008 // device type; 1 is net, 2 is disk +#define VIRTIO_MMIO_VENDOR_ID 0x00c // 0x554d4551 +#define VIRTIO_MMIO_DEVICE_FEATURES 0x010 +#define VIRTIO_MMIO_DRIVER_FEATURES 0x020 +#define VIRTIO_MMIO_GUEST_PAGE_SIZE 0x028 // page size for PFN, write-only +#define VIRTIO_MMIO_QUEUE_SEL 0x030 // select queue, write-only +#define VIRTIO_MMIO_QUEUE_NUM_MAX 0x034 // max size of current queue, read-only +#define VIRTIO_MMIO_QUEUE_NUM 0x038 // size of current queue, write-only +#define VIRTIO_MMIO_QUEUE_ALIGN 0x03c // used ring alignment, write-only +#define VIRTIO_MMIO_QUEUE_PFN 0x040 // physical page number for queue, read/write +#define VIRTIO_MMIO_QUEUE_READY 0x044 // ready bit +#define VIRTIO_MMIO_QUEUE_NOTIFY 0x050 // write-only +#define VIRTIO_MMIO_INTERRUPT_STATUS 0x060 // read-only +#define VIRTIO_MMIO_INTERRUPT_ACK 0x064 // write-only +#define VIRTIO_MMIO_STATUS 0x070 // read/write + +// status register bits, from qemu virtio_config.h +#define VIRTIO_CONFIG_S_ACKNOWLEDGE 1 +#define VIRTIO_CONFIG_S_DRIVER 2 +#define VIRTIO_CONFIG_S_DRIVER_OK 4 +#define VIRTIO_CONFIG_S_FEATURES_OK 8 + +// device feature bits +#define VIRTIO_BLK_F_RO 5 /* Disk is read-only */ +#define VIRTIO_BLK_F_SCSI 7 /* Supports scsi command passthru */ +#define VIRTIO_BLK_F_CONFIG_WCE 11 /* Writeback mode available in config */ +#define VIRTIO_BLK_F_MQ 12 /* support more than one vq */ +#define VIRTIO_F_ANY_LAYOUT 27 +#define VIRTIO_RING_F_INDIRECT_DESC 28 +#define VIRTIO_RING_F_EVENT_IDX 29 + +// this many virtio descriptors. +// must be a power of two. 
+#define NUM 8 + +struct VRingDesc { + uint64 addr; + uint32 len; + uint16 flags; + uint16 next; +}; +#define VRING_DESC_F_NEXT 1 // chained with another descriptor +#define VRING_DESC_F_WRITE 2 // device writes (vs read) + +struct VRingUsedElem { + uint32 id; // index of start of completed descriptor chain + uint32 len; +}; + +// for disk ops +#define VIRTIO_BLK_T_IN 0 // read the disk +#define VIRTIO_BLK_T_OUT 1 // write the disk + +struct UsedArea { + uint16 flags; + uint16 id; + struct VRingUsedElem elems[NUM]; +}; diff --git a/kernel/virtio_disk.c b/kernel/virtio_disk.c new file mode 100644 index 0000000..3cff024 --- /dev/null +++ b/kernel/virtio_disk.c @@ -0,0 +1,269 @@ +// +// driver for qemu's virtio disk device. +// uses qemu's mmio interface to virtio. +// qemu presents a "legacy" virtio interface. +// +// qemu ... -drive file=fs.img,if=none,format=raw,id=x0 -device virtio-blk-device,drive=x0,bus=virtio-mmio-bus.0 +// + +#include "types.h" +#include "riscv.h" +#include "defs.h" +#include "param.h" +#include "memlayout.h" +#include "spinlock.h" +#include "sleeplock.h" +#include "fs.h" +#include "buf.h" +#include "virtio.h" + +// the address of virtio mmio register r. +#define R(r) ((volatile uint32 *)(VIRTIO0 + (r))) + +static struct disk { + // memory for virtio descriptors &c for queue 0. + // this is a global instead of allocated because it must + // be multiple contiguous pages, which kalloc() + // doesn't support, and page aligned. + char pages[2*PGSIZE]; + struct VRingDesc *desc; + uint16 *avail; + struct UsedArea *used; + + // our own book-keeping. + char free[NUM]; // is a descriptor free? + uint16 used_idx; // we've looked this far in used[2..NUM]. + + // track info about in-flight operations, + // for use when completion interrupt arrives. + // indexed by first descriptor index of chain. 
+ struct { + struct buf *b; + char status; + } info[NUM]; + + struct spinlock vdisk_lock; + +} __attribute__ ((aligned (PGSIZE))) disk; + +void +virtio_disk_init(void) +{ + uint32 status = 0; + + initlock(&disk.vdisk_lock, "virtio_disk"); + + if(*R(VIRTIO_MMIO_MAGIC_VALUE) != 0x74726976 || + *R(VIRTIO_MMIO_VERSION) != 1 || + *R(VIRTIO_MMIO_DEVICE_ID) != 2 || + *R(VIRTIO_MMIO_VENDOR_ID) != 0x554d4551){ + panic("could not find virtio disk"); + } + + status |= VIRTIO_CONFIG_S_ACKNOWLEDGE; + *R(VIRTIO_MMIO_STATUS) = status; + + status |= VIRTIO_CONFIG_S_DRIVER; + *R(VIRTIO_MMIO_STATUS) = status; + + // negotiate features + uint64 features = *R(VIRTIO_MMIO_DEVICE_FEATURES); + features &= ~(1 << VIRTIO_BLK_F_RO); + features &= ~(1 << VIRTIO_BLK_F_SCSI); + features &= ~(1 << VIRTIO_BLK_F_CONFIG_WCE); + features &= ~(1 << VIRTIO_BLK_F_MQ); + features &= ~(1 << VIRTIO_F_ANY_LAYOUT); + features &= ~(1 << VIRTIO_RING_F_EVENT_IDX); + features &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC); + *R(VIRTIO_MMIO_DRIVER_FEATURES) = features; + + // tell device that feature negotiation is complete. + status |= VIRTIO_CONFIG_S_FEATURES_OK; + *R(VIRTIO_MMIO_STATUS) = status; + + // tell device we're completely ready. + status |= VIRTIO_CONFIG_S_DRIVER_OK; + *R(VIRTIO_MMIO_STATUS) = status; + + *R(VIRTIO_MMIO_GUEST_PAGE_SIZE) = PGSIZE; + + // initialize queue 0. 
+ *R(VIRTIO_MMIO_QUEUE_SEL) = 0; + uint32 max = *R(VIRTIO_MMIO_QUEUE_NUM_MAX); + if(max == 0) + panic("virtio disk has no queue 0"); + if(max < NUM) + panic("virtio disk max queue too short"); + *R(VIRTIO_MMIO_QUEUE_NUM) = NUM; + memset(disk.pages, 0, sizeof(disk.pages)); + *R(VIRTIO_MMIO_QUEUE_PFN) = ((uint64)disk.pages) >> PGSHIFT; + + // desc = pages -- num * VRingDesc + // avail = pages + 0x40 -- 2 * uint16, then num * uint16 + // used = pages + 4096 -- 2 * uint16, then num * vRingUsedElem + + disk.desc = (struct VRingDesc *) disk.pages; + disk.avail = (uint16*)(((char*)disk.desc) + NUM*sizeof(struct VRingDesc)); + disk.used = (struct UsedArea *) (disk.pages + PGSIZE); + + for(int i = 0; i < NUM; i++) + disk.free[i] = 1; + + // plic.c and trap.c arrange for interrupts from VIRTIO0_IRQ. +} + +// find a free descriptor, mark it non-free, return its index. +static int +alloc_desc() +{ + for(int i = 0; i < NUM; i++){ + if(disk.free[i]){ + disk.free[i] = 0; + return i; + } + } + return -1; +} + +// mark a descriptor as free. +static void +free_desc(int i) +{ + if(i >= NUM) + panic("virtio_disk_intr 1"); + if(disk.free[i]) + panic("virtio_disk_intr 2"); + disk.desc[i].addr = 0; + disk.free[i] = 1; + wakeup(&disk.free[0]); +} + +// free a chain of descriptors. +static void +free_chain(int i) +{ + while(1){ + free_desc(i); + if(disk.desc[i].flags & VRING_DESC_F_NEXT) + i = disk.desc[i].next; + else + break; + } +} + +static int +alloc3_desc(int *idx) +{ + for(int i = 0; i < 3; i++){ + idx[i] = alloc_desc(); + if(idx[i] < 0){ + for(int j = 0; j < i; j++) + free_desc(idx[j]); + return -1; + } + } + return 0; +} + +void +virtio_disk_rw(struct buf *b, int write) +{ + uint64 sector = b->blockno * (BSIZE / 512); + + acquire(&disk.vdisk_lock); + + // the spec says that legacy block operations use three + // descriptors: one for type/reserved/sector, one for + // the data, one for a 1-byte status result. + + // allocate the three descriptors. 
+ int idx[3]; + while(1){ + if(alloc3_desc(idx) == 0) { + break; + } + sleep(&disk.free[0], &disk.vdisk_lock); + } + + // format the three descriptors. + // qemu's virtio-blk.c reads them. + + struct virtio_blk_outhdr { + uint32 type; + uint32 reserved; + uint64 sector; + } buf0; + + if(write) + buf0.type = VIRTIO_BLK_T_OUT; // write the disk + else + buf0.type = VIRTIO_BLK_T_IN; // read the disk + buf0.reserved = 0; + buf0.sector = sector; + + // buf0 is on a kernel stack, which is not direct mapped, + // thus the call to kvmpa(). + disk.desc[idx[0]].addr = (uint64) kvmpa((uint64) &buf0); + disk.desc[idx[0]].len = sizeof(buf0); + disk.desc[idx[0]].flags = VRING_DESC_F_NEXT; + disk.desc[idx[0]].next = idx[1]; + + disk.desc[idx[1]].addr = (uint64) b->data; + disk.desc[idx[1]].len = BSIZE; + if(write) + disk.desc[idx[1]].flags = 0; // device reads b->data + else + disk.desc[idx[1]].flags = VRING_DESC_F_WRITE; // device writes b->data + disk.desc[idx[1]].flags |= VRING_DESC_F_NEXT; + disk.desc[idx[1]].next = idx[2]; + + disk.info[idx[0]].status = 0; + disk.desc[idx[2]].addr = (uint64) &disk.info[idx[0]].status; + disk.desc[idx[2]].len = 1; + disk.desc[idx[2]].flags = VRING_DESC_F_WRITE; // device writes the status + disk.desc[idx[2]].next = 0; + + // record struct buf for virtio_disk_intr(). + b->disk = 1; + disk.info[idx[0]].b = b; + + // avail[0] is flags + // avail[1] tells the device how far to look in avail[2...]. + // avail[2...] are desc[] indices the device should process. + // we only tell device the first index in our chain of descriptors. + disk.avail[2 + (disk.avail[1] % NUM)] = idx[0]; + __sync_synchronize(); + disk.avail[1] = disk.avail[1] + 1; + + *R(VIRTIO_MMIO_QUEUE_NOTIFY) = 0; // value is queue number + + // Wait for virtio_disk_intr() to say request has finished. 
  while(b->disk == 1) {
    // virtio_disk_intr() clears b->disk and wakes us when done.
    sleep(b, &disk.vdisk_lock);
  }

  disk.info[idx[0]].b = 0;
  free_chain(idx[0]);

  release(&disk.vdisk_lock);
}

// the disk interrupt handler: consume entries the device has
// appended to the used ring and mark the corresponding bufs done.
void
virtio_disk_intr()
{
  acquire(&disk.vdisk_lock);

  // NOTE(review): disk.used->id is re-read each iteration without a
  // memory barrier, and comparing modulo NUM cannot distinguish an
  // empty ring from one that wrapped a full NUM entries -- verify
  // against the virtio spec's used-ring protocol.
  while((disk.used_idx % NUM) != (disk.used->id % NUM)){
    int id = disk.used->elems[disk.used_idx].id;

    if(disk.info[id].status != 0)
      panic("virtio_disk_intr status"); // device reported an I/O error

    disk.info[id].b->disk = 0;   // disk is done with buf
    wakeup(disk.info[id].b);

    disk.used_idx = (disk.used_idx + 1) % NUM;
  }

  release(&disk.vdisk_lock);
}
diff --git a/kernel/vm.c b/kernel/vm.c new file mode 100644 index 0000000..3631c9c --- /dev/null +++ b/kernel/vm.c @@ -0,0 +1,441 @@
#include "param.h"
#include "types.h"
#include "memlayout.h"
#include "elf.h"
#include "riscv.h"
#include "defs.h"
#include "fs.h"

/*
 * the kernel's page table.
 */
pagetable_t kernel_pagetable;

extern char etext[];  // kernel.ld sets this to end of kernel code.

extern char trampoline[]; // trampoline.S

/*
 * create a direct-map page table for the kernel and
 * turn on paging. called early, in supervisor mode.
 * the page allocator is already initialized.
 */
void
kvminit()
{
  kernel_pagetable = (pagetable_t) kalloc();
  memset(kernel_pagetable, 0, PGSIZE);

  // uart registers
  kvmmap(UART0, UART0, PGSIZE, PTE_R | PTE_W);

  // virtio mmio disk interface
  kvmmap(VIRTIO0, VIRTIO0, PGSIZE, PTE_R | PTE_W);

  // CLINT
  kvmmap(CLINT, CLINT, 0x10000, PTE_R | PTE_W);

  // PLIC
  kvmmap(PLIC, PLIC, 0x400000, PTE_R | PTE_W);

  // map kernel text executable and read-only.
  kvmmap(KERNBASE, KERNBASE, (uint64)etext-KERNBASE, PTE_R | PTE_X);

  // map kernel data and the physical RAM we'll make use of.
  kvmmap((uint64)etext, (uint64)etext, PHYSTOP-(uint64)etext, PTE_R | PTE_W);

  // map the trampoline for trap entry/exit to
  // the highest virtual address in the kernel.
  kvmmap(TRAMPOLINE, (uint64)trampoline, PGSIZE, PTE_R | PTE_X);
}

// Switch h/w page table register to the kernel's page table,
// and enable paging.
// NOTE(review): there is no sfence_vma() *after* w_satp(), so stale
// translations could remain in the TLB once paging is on -- confirm
// the required ordering against the RISC-V privileged spec.
void
kvminithart()
{
  sfence_vma();
  w_satp(MAKE_SATP(kernel_pagetable));
}

// Return the address of the PTE in page table pagetable
// that corresponds to virtual address va. If alloc!=0,
// create any required page-table pages.
//
// The risc-v Sv39 scheme has three levels of page-table
// pages. A page-table page contains 512 64-bit PTEs.
// A 64-bit virtual address is split into five fields:
//   39..63 -- must be zero.
//   30..38 -- 9 bits of level-2 index.
//   21..29 -- 9 bits of level-1 index.
//   12..20 -- 9 bits of level-0 index.
//    0..11 -- 12 bits of byte offset within the page.
static pte_t *
walk(pagetable_t pagetable, uint64 va, int alloc)
{
  if(va >= MAXVA)
    panic("walk");

  // descend from the root (level 2) toward the leaf level (0).
  for(int level = 2; level > 0; level--) {
    pte_t *pte = &pagetable[PX(level, va)];
    if(*pte & PTE_V) {
      // PTE valid: follow it to the next-level page-table page.
      pagetable = (pagetable_t)PTE2PA(*pte);
    } else {
      // missing interior page; allocate one if the caller asked.
      if(!alloc || (pagetable = (pde_t*)kalloc()) == 0)
        return 0;
      memset(pagetable, 0, PGSIZE);
      *pte = PA2PTE(pagetable) | PTE_V;
    }
  }
  return &pagetable[PX(0, va)];
}

// Look up a virtual address, return the physical address,
// or 0 if not mapped.
// Can only be used to look up user pages (requires PTE_U).
uint64
walkaddr(pagetable_t pagetable, uint64 va)
{
  pte_t *pte;
  uint64 pa;

  pte = walk(pagetable, va, 0);
  if(pte == 0)
    return 0;
  if((*pte & PTE_V) == 0)
    return 0;
  if((*pte & PTE_U) == 0)
    return 0;
  pa = PTE2PA(*pte);
  return pa;
}

// add a mapping to the kernel page table.
// only used when booting.
// does not flush TLB or enable paging.
void
kvmmap(uint64 va, uint64 pa, uint64 sz, int perm)
{
  if(mappages(kernel_pagetable, va, sz, pa, perm) != 0)
    panic("kvmmap");
}

// translate a kernel virtual address to
// a physical address. only needed for
// addresses on the stack.
// (the page offset of va is preserved, so va need not be page aligned.)
+uint64 +kvmpa(uint64 va) +{ + uint64 off = va % PGSIZE; + pte_t *pte; + uint64 pa; + + pte = walk(kernel_pagetable, va, 0); + if(pte == 0) + panic("kvmpa"); + if((*pte & PTE_V) == 0) + panic("kvmpa"); + pa = PTE2PA(*pte); + return pa+off; +} + +// Create PTEs for virtual addresses starting at va that refer to +// physical addresses starting at pa. va and size might not +// be page-aligned. Returns 0 on success, -1 if walk() couldn't +// allocate a needed page-table page. +int +mappages(pagetable_t pagetable, uint64 va, uint64 size, uint64 pa, int perm) +{ + uint64 a, last; + pte_t *pte; + + a = PGROUNDDOWN(va); + last = PGROUNDDOWN(va + size - 1); + for(;;){ + if((pte = walk(pagetable, a, 1)) == 0) + return -1; + if(*pte & PTE_V) + panic("remap"); + *pte = PA2PTE(pa) | perm | PTE_V; + if(a == last) + break; + a += PGSIZE; + pa += PGSIZE; + } + return 0; +} + +// Remove mappings from a page table. The mappings in +// the given range must exist. Optionally free the +// physical memory. +void +uvmunmap(pagetable_t pagetable, uint64 va, uint64 size, int do_free) +{ + uint64 a, last; + pte_t *pte; + uint64 pa; + + a = PGROUNDDOWN(va); + last = PGROUNDDOWN(va + size - 1); + for(;;){ + if((pte = walk(pagetable, a, 0)) == 0) + panic("uvmunmap: walk"); + if((*pte & PTE_V) == 0){ + printf("va=%p pte=%p\n", a, *pte); + panic("uvmunmap: not mapped"); + } + if(PTE_FLAGS(*pte) == PTE_V) + panic("uvmunmap: not a leaf"); + if(do_free){ + pa = PTE2PA(*pte); + kfree((void*)pa); + } + *pte = 0; + if(a == last) + break; + a += PGSIZE; + pa += PGSIZE; + } +} + +// create an empty user page table. +pagetable_t +uvmcreate() +{ + pagetable_t pagetable; + pagetable = (pagetable_t) kalloc(); + if(pagetable == 0) + panic("uvmcreate: out of memory"); + memset(pagetable, 0, PGSIZE); + return pagetable; +} + +// Load the user initcode into address 0 of pagetable, +// for the very first process. +// sz must be less than a page. 
+void +uvminit(pagetable_t pagetable, uchar *src, uint sz) +{ + char *mem; + + if(sz >= PGSIZE) + panic("inituvm: more than a page"); + mem = kalloc(); + memset(mem, 0, PGSIZE); + mappages(pagetable, 0, PGSIZE, (uint64)mem, PTE_W|PTE_R|PTE_X|PTE_U); + memmove(mem, src, sz); +} + +// Allocate PTEs and physical memory to grow process from oldsz to +// newsz, which need not be page aligned. Returns new size or 0 on error. +uint64 +uvmalloc(pagetable_t pagetable, uint64 oldsz, uint64 newsz) +{ + char *mem; + uint64 a; + + if(newsz < oldsz) + return oldsz; + + oldsz = PGROUNDUP(oldsz); + a = oldsz; + for(; a < newsz; a += PGSIZE){ + mem = kalloc(); + if(mem == 0){ + uvmdealloc(pagetable, a, oldsz); + return 0; + } + memset(mem, 0, PGSIZE); + if(mappages(pagetable, a, PGSIZE, (uint64)mem, PTE_W|PTE_X|PTE_R|PTE_U) != 0){ + kfree(mem); + uvmdealloc(pagetable, a, oldsz); + return 0; + } + } + return newsz; +} + +// Deallocate user pages to bring the process size from oldsz to +// newsz. oldsz and newsz need not be page-aligned, nor does newsz +// need to be less than oldsz. oldsz can be larger than the actual +// process size. Returns the new process size. +uint64 +uvmdealloc(pagetable_t pagetable, uint64 oldsz, uint64 newsz) +{ + if(newsz >= oldsz) + return oldsz; + uvmunmap(pagetable, newsz, oldsz - newsz, 1); + return newsz; +} + +// Recursively free page-table pages. +// All leaf mappings must already have been removed. +static void +freewalk(pagetable_t pagetable) +{ + // there are 2^9 = 512 PTEs in a page table. + for(int i = 0; i < 512; i++){ + pte_t pte = pagetable[i]; + if((pte & PTE_V) && (pte & (PTE_R|PTE_W|PTE_X)) == 0){ + // this PTE points to a lower-level page table. + uint64 child = PTE2PA(pte); + freewalk((pagetable_t)child); + pagetable[i] = 0; + } else if(pte & PTE_V){ + panic("freewalk: leaf"); + } + } + kfree((void*)pagetable); +} + +// Free user memory pages, +// then free page-table pages. 
+void +uvmfree(pagetable_t pagetable, uint64 sz) +{ + uvmunmap(pagetable, 0, sz, 1); + freewalk(pagetable); +} + +// Given a parent process's page table, copy +// its memory into a child's page table. +// Copies both the page table and the +// physical memory. +// returns 0 on success, -1 on failure. +// frees any allocated pages on failure. +int +uvmcopy(pagetable_t old, pagetable_t new, uint64 sz) +{ + pte_t *pte; + uint64 pa, i; + uint flags; + char *mem; + + for(i = 0; i < sz; i += PGSIZE){ + if((pte = walk(old, i, 0)) == 0) + panic("copyuvm: pte should exist"); + if((*pte & PTE_V) == 0) + panic("copyuvm: page not present"); + pa = PTE2PA(*pte); + flags = PTE_FLAGS(*pte); + if((mem = kalloc()) == 0) + goto err; + memmove(mem, (char*)pa, PGSIZE); + if(mappages(new, i, PGSIZE, (uint64)mem, flags) != 0){ + kfree(mem); + goto err; + } + } + return 0; + + err: + uvmunmap(new, 0, i, 1); + return -1; +} + +// mark a PTE invalid for user access. +// used by exec for the user stack guard page. +void +uvmclear(pagetable_t pagetable, uint64 va) +{ + pte_t *pte; + + pte = walk(pagetable, va, 0); + if(pte == 0) + panic("uvmclear"); + *pte &= ~PTE_U; +} + +// Copy from kernel to user. +// Copy len bytes from src to virtual address dstva in a given page table. +// Return 0 on success, -1 on error. +int +copyout(pagetable_t pagetable, uint64 dstva, char *src, uint64 len) +{ + uint64 n, va0, pa0; + + while(len > 0){ + va0 = (uint)PGROUNDDOWN(dstva); + pa0 = walkaddr(pagetable, va0); + if(pa0 == 0) + return -1; + n = PGSIZE - (dstva - va0); + if(n > len) + n = len; + memmove((void *)(pa0 + (dstva - va0)), src, n); + + len -= n; + src += n; + dstva = va0 + PGSIZE; + } + return 0; +} + +// Copy from user to kernel. +// Copy len bytes to dst from virtual address srcva in a given page table. +// Return 0 on success, -1 on error. 
+int +copyin(pagetable_t pagetable, char *dst, uint64 srcva, uint64 len) +{ + uint64 n, va0, pa0; + + while(len > 0){ + va0 = (uint)PGROUNDDOWN(srcva); + pa0 = walkaddr(pagetable, va0); + if(pa0 == 0) + return -1; + n = PGSIZE - (srcva - va0); + if(n > len) + n = len; + memmove(dst, (void *)(pa0 + (srcva - va0)), n); + + len -= n; + dst += n; + srcva = va0 + PGSIZE; + } + return 0; +} + +// Copy a null-terminated string from user to kernel. +// Copy bytes to dst from virtual address srcva in a given page table, +// until a '\0', or max. +// Return 0 on success, -1 on error. +int +copyinstr(pagetable_t pagetable, char *dst, uint64 srcva, uint64 max) +{ + uint64 n, va0, pa0; + int got_null = 0; + + while(got_null == 0 && max > 0){ + va0 = (uint)PGROUNDDOWN(srcva); + pa0 = walkaddr(pagetable, va0); + if(pa0 == 0) + return -1; + n = PGSIZE - (srcva - va0); + if(n > max) + n = max; + + char *p = (char *) (pa0 + (srcva - va0)); + while(n > 0){ + if(*p == '\0'){ + *dst = '\0'; + got_null = 1; + break; + } else { + *dst = *p; + } + --n; + --max; + p++; + dst++; + } + + srcva = va0 + PGSIZE; + } + if(got_null){ + return 0; + } else { + return -1; + } +} |