diff options
Diffstat (limited to 'kernel/fs.c')
-rw-r--r-- | kernel/fs.c | 681 |
1 files changed, 681 insertions, 0 deletions
diff --git a/kernel/fs.c b/kernel/fs.c new file mode 100644 index 0000000..ebe377a --- /dev/null +++ b/kernel/fs.c @@ -0,0 +1,681 @@ +// File system implementation. Five layers: +// + Blocks: allocator for raw disk blocks. +// + Log: crash recovery for multi-step updates. +// + Files: inode allocator, reading, writing, metadata. +// + Directories: inode with special contents (list of other inodes!) +// + Names: paths like /usr/rtm/xv6/fs.c for convenient naming. +// +// This file contains the low-level file system manipulation +// routines. The (higher-level) system call implementations +// are in sysfile.c. + +#include "types.h" +#include "riscv.h" +#include "defs.h" +#include "param.h" +#include "stat.h" +#include "proc.h" +#include "spinlock.h" +#include "sleeplock.h" +#include "fs.h" +#include "buf.h" +#include "file.h" + +#define min(a, b) ((a) < (b) ? (a) : (b)) +static void itrunc(struct inode*); +// there should be one superblock per disk device, but we run with +// only one device +struct superblock sb; + +// Read the super block. +void +readsb(int dev, struct superblock *sb) +{ + struct buf *bp; + + bp = bread(dev, 1); + memmove(sb, bp->data, sizeof(*sb)); + brelse(bp); +} + +// Zero a block. +static void +bzero(int dev, int bno) +{ + struct buf *bp; + + bp = bread(dev, bno); + memset(bp->data, 0, BSIZE); + log_write(bp); + brelse(bp); +} + +// Blocks. + +// Allocate a zeroed disk block. +static uint +balloc(uint dev) +{ + int b, bi, m; + struct buf *bp; + + bp = 0; + for(b = 0; b < sb.size; b += BPB){ + bp = bread(dev, BBLOCK(b, sb)); + for(bi = 0; bi < BPB && b + bi < sb.size; bi++){ + m = 1 << (bi % 8); + if((bp->data[bi/8] & m) == 0){ // Is block free? + bp->data[bi/8] |= m; // Mark block in use. + log_write(bp); + brelse(bp); + bzero(dev, b + bi); + return b + bi; + } + } + brelse(bp); + } + panic("balloc: out of blocks"); +} + +// Free a disk block. +static void +bfree(int dev, uint b) +{ + struct buf *bp; + int bi, m; + + readsb(dev, &sb); + bp = bread(dev, BBLOCK(b, sb)); + bi = b % BPB; + m = 1 << (bi % 8); + if((bp->data[bi/8] & m) == 0) + panic("freeing free block"); + bp->data[bi/8] &= ~m; + log_write(bp); + brelse(bp); +} + +// Inodes. +// +// An inode describes a single unnamed file. +// The inode disk structure holds metadata: the file's type, +// its size, the number of links referring to it, and the +// list of blocks holding the file's content. +// +// The inodes are laid out sequentially on disk at +// sb.startinode. Each inode has a number, indicating its +// position on the disk. +// +// The kernel keeps a cache of in-use inodes in memory +// to provide a place for synchronizing access +// to inodes used by multiple processes. The cached +// inodes include book-keeping information that is +// not stored on disk: ip->ref and ip->valid. +// +// An inode and its in-memory representation go through a +// sequence of states before they can be used by the +// rest of the file system code. +// +// * Allocation: an inode is allocated if its type (on disk) +// is non-zero. ialloc() allocates, and iput() frees if +// the reference and link counts have fallen to zero. +// +// * Referencing in cache: an entry in the inode cache +// is free if ip->ref is zero. Otherwise ip->ref tracks +// the number of in-memory pointers to the entry (open +// files and current directories). iget() finds or +// creates a cache entry and increments its ref; iput() +// decrements ref. +// +// * Valid: the information (type, size, &c) in an inode +// cache entry is only correct when ip->valid is 1. +// ilock() reads the inode from +// the disk and sets ip->valid, while iput() clears +// ip->valid if ip->ref has fallen to zero. +// +// * Locked: file system code may only examine and modify +// the information in an inode and its content if it +// has first locked the inode. +// +// Thus a typical sequence is: +// ip = iget(dev, inum) +// ilock(ip) +// ... examine and modify ip->xxx ... +// iunlock(ip) +// iput(ip) +// +// ilock() is separate from iget() so that system calls can +// get a long-term reference to an inode (as for an open file) +// and only lock it for short periods (e.g., in read()). +// The separation also helps avoid deadlock and races during +// pathname lookup. iget() increments ip->ref so that the inode +// stays cached and pointers to it remain valid. +// +// Many internal file system functions expect the caller to +// have locked the inodes involved; this lets callers create +// multi-step atomic operations. +// +// The icache.lock spin-lock protects the allocation of icache +// entries. Since ip->ref indicates whether an entry is free, +// and ip->dev and ip->inum indicate which i-node an entry +// holds, one must hold icache.lock while using any of those fields. +// +// An ip->lock sleep-lock protects all ip-> fields other than ref, +// dev, and inum. One must hold ip->lock in order to +// read or write that inode's ip->valid, ip->size, ip->type, &c. + +struct { + struct spinlock lock; + struct inode inode[NINODE]; +} icache; + +void +iinit(int dev) +{ + int i = 0; + + initlock(&icache.lock, "icache"); + for(i = 0; i < NINODE; i++) { + initsleeplock(&icache.inode[i].lock, "inode"); + } + + readsb(dev, &sb); + if(sb.magic != FSMAGIC) + panic("invalid file system"); +} + +static struct inode* iget(uint dev, uint inum); + +//PAGEBREAK! +// Allocate an inode on device dev. +// Mark it as allocated by giving it type type. +// Returns an unlocked but allocated and referenced inode. +struct inode* +ialloc(uint dev, short type) +{ + int inum; + struct buf *bp; + struct dinode *dip; + + for(inum = 1; inum < sb.ninodes; inum++){ + bp = bread(dev, IBLOCK(inum, sb)); + dip = (struct dinode*)bp->data + inum%IPB; + if(dip->type == 0){ // a free inode + memset(dip, 0, sizeof(*dip)); + dip->type = type; + log_write(bp); // mark it allocated on the disk + brelse(bp); + return iget(dev, inum); + } + brelse(bp); + } + panic("ialloc: no inodes"); +} + +// Copy a modified in-memory inode to disk. +// Must be called after every change to an ip->xxx field +// that lives on disk, since i-node cache is write-through. +// Caller must hold ip->lock. +void +iupdate(struct inode *ip) +{ + struct buf *bp; + struct dinode *dip; + + bp = bread(ip->dev, IBLOCK(ip->inum, sb)); + dip = (struct dinode*)bp->data + ip->inum%IPB; + dip->type = ip->type; + dip->major = ip->major; + dip->minor = ip->minor; + dip->nlink = ip->nlink; + dip->size = ip->size; + memmove(dip->addrs, ip->addrs, sizeof(ip->addrs)); + log_write(bp); + brelse(bp); +} + +// Find the inode with number inum on device dev +// and return the in-memory copy. Does not lock +// the inode and does not read it from disk. +static struct inode* +iget(uint dev, uint inum) +{ + struct inode *ip, *empty; + + acquire(&icache.lock); + + // Is the inode already cached? + empty = 0; + for(ip = &icache.inode[0]; ip < &icache.inode[NINODE]; ip++){ + if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){ + ip->ref++; + release(&icache.lock); + return ip; + } + if(empty == 0 && ip->ref == 0) // Remember empty slot. + empty = ip; + } + + // Recycle an inode cache entry. + if(empty == 0) + panic("iget: no inodes"); + + ip = empty; + ip->dev = dev; + ip->inum = inum; + ip->ref = 1; + ip->valid = 0; + release(&icache.lock); + + return ip; +} + +// Increment reference count for ip. +// Returns ip to enable ip = idup(ip1) idiom. +struct inode* +idup(struct inode *ip) +{ + acquire(&icache.lock); + ip->ref++; + release(&icache.lock); + return ip; +} + +// Lock the given inode. +// Reads the inode from disk if necessary. +void +ilock(struct inode *ip) +{ + struct buf *bp; + struct dinode *dip; + + if(ip == 0 || ip->ref < 1) + panic("ilock"); + + acquiresleep(&ip->lock); + + if(ip->valid == 0){ + bp = bread(ip->dev, IBLOCK(ip->inum, sb)); + dip = (struct dinode*)bp->data + ip->inum%IPB; + ip->type = dip->type; + ip->major = dip->major; + ip->minor = dip->minor; + ip->nlink = dip->nlink; + ip->size = dip->size; + memmove(ip->addrs, dip->addrs, sizeof(ip->addrs)); + brelse(bp); + ip->valid = 1; + if(ip->type == 0) + panic("ilock: no type"); + } +} + +// Unlock the given inode. +void +iunlock(struct inode *ip) +{ + if(ip == 0 || !holdingsleep(&ip->lock) || ip->ref < 1) + panic("iunlock"); + + releasesleep(&ip->lock); +} + +// Drop a reference to an in-memory inode. +// If that was the last reference, the inode cache entry can +// be recycled. +// If that was the last reference and the inode has no links +// to it, free the inode (and its content) on disk. +// All calls to iput() must be inside a transaction in +// case it has to free the inode. +void +iput(struct inode *ip) +{ + acquire(&icache.lock); + + if(ip->ref == 1 && ip->valid && ip->nlink == 0){ + // inode has no links and no other references: truncate and free. + + // ip->ref == 1 means no other process can have ip locked, + // so this acquiresleep() won't block (or deadlock). + acquiresleep(&ip->lock); + + release(&icache.lock); + + itrunc(ip); + ip->type = 0; + iupdate(ip); + ip->valid = 0; + + releasesleep(&ip->lock); + + acquire(&icache.lock); + } + + ip->ref--; + release(&icache.lock); +} + +// Common idiom: unlock, then put. +void +iunlockput(struct inode *ip) +{ + iunlock(ip); + iput(ip); +} + +//PAGEBREAK! +// Inode content +// +// The content (data) associated with each inode is stored +// in blocks on the disk. The first NDIRECT block numbers +// are listed in ip->addrs[]. The next NINDIRECT blocks are +// listed in block ip->addrs[NDIRECT]. + +// Return the disk block address of the nth block in inode ip. +// If there is no such block, bmap allocates one. +static uint +bmap(struct inode *ip, uint bn) +{ + uint addr, *a; + struct buf *bp; + + if(bn < NDIRECT){ + if((addr = ip->addrs[bn]) == 0) + ip->addrs[bn] = addr = balloc(ip->dev); + return addr; + } + bn -= NDIRECT; + + if(bn < NINDIRECT){ + // Load indirect block, allocating if necessary. + if((addr = ip->addrs[NDIRECT]) == 0) + ip->addrs[NDIRECT] = addr = balloc(ip->dev); + bp = bread(ip->dev, addr); + a = (uint*)bp->data; + if((addr = a[bn]) == 0){ + a[bn] = addr = balloc(ip->dev); + log_write(bp); + } + brelse(bp); + return addr; + } + + panic("bmap: out of range"); +} + +// Truncate inode (discard contents). +// Only called when the inode has no links +// to it (no directory entries referring to it) +// and has no in-memory reference to it (is +// not an open file or current directory). +static void +itrunc(struct inode *ip) +{ + int i, j; + struct buf *bp; + uint *a; + + for(i = 0; i < NDIRECT; i++){ + if(ip->addrs[i]){ + bfree(ip->dev, ip->addrs[i]); + ip->addrs[i] = 0; + } + } + + if(ip->addrs[NDIRECT]){ + bp = bread(ip->dev, ip->addrs[NDIRECT]); + a = (uint*)bp->data; + for(j = 0; j < NINDIRECT; j++){ + if(a[j]) + bfree(ip->dev, a[j]); + } + brelse(bp); + bfree(ip->dev, ip->addrs[NDIRECT]); + ip->addrs[NDIRECT] = 0; + } + + ip->size = 0; + iupdate(ip); +} + +// Copy stat information from inode. +// Caller must hold ip->lock. +void +stati(struct inode *ip, struct stat *st) +{ + st->dev = ip->dev; + st->ino = ip->inum; + st->type = ip->type; + st->nlink = ip->nlink; + st->size = ip->size; +} + +//PAGEBREAK! +// Read data from inode. +// Caller must hold ip->lock. +// If user_dst==1, then dst is a user virtual address; +// otherwise, dst is a kernel address. +int +readi(struct inode *ip, int user_dst, uint64 dst, uint off, uint n) +{ + uint tot, m; + struct buf *bp; + + if(ip->type == T_DEV){ + if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].read) + return -1; + return devsw[ip->major].read(ip, user_dst, dst, n); + } + + if(off > ip->size || off + n < off) + return -1; + if(off + n > ip->size) + n = ip->size - off; + + for(tot=0; tot<n; tot+=m, off+=m, dst+=m){ + bp = bread(ip->dev, bmap(ip, off/BSIZE)); + m = min(n - tot, BSIZE - off%BSIZE); + if(either_copyout(user_dst, dst, bp->data + (off % BSIZE), m) == -1) + break; + brelse(bp); + } + return n; +} + +// PAGEBREAK! +// Write data to inode. +// Caller must hold ip->lock. +// If user_src==1, then src is a user virtual address; +// otherwise, src is a kernel address. +int +writei(struct inode *ip, int user_src, uint64 src, uint off, uint n) +{ + uint tot, m; + struct buf *bp; + + if(ip->type == T_DEV){ + if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].write){ + return -1; + } + return devsw[ip->major].write(ip, user_src, src, n); + } + + if(off > ip->size || off + n < off) + return -1; + if(off + n > MAXFILE*BSIZE) + return -1; + + for(tot=0; tot<n; tot+=m, off+=m, src+=m){ + bp = bread(ip->dev, bmap(ip, off/BSIZE)); + m = min(n - tot, BSIZE - off%BSIZE); + if(either_copyin(bp->data + (off % BSIZE), user_src, src, m) == -1) + break; + log_write(bp); + brelse(bp); + } + + if(n > 0 && off > ip->size){ + ip->size = off; + iupdate(ip); + } + return n; +} + +//PAGEBREAK! +// Directories + +int +namecmp(const char *s, const char *t) +{ + return strncmp(s, t, DIRSIZ); +} + +// Look for a directory entry in a directory. +// If found, set *poff to byte offset of entry. +struct inode* +dirlookup(struct inode *dp, char *name, uint *poff) +{ + uint off, inum; + struct dirent de; + + if(dp->type != T_DIR) + panic("dirlookup not DIR"); + + for(off = 0; off < dp->size; off += sizeof(de)){ + if(readi(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de)) + panic("dirlookup read"); + if(de.inum == 0) + continue; + if(namecmp(name, de.name) == 0){ + // entry matches path element + if(poff) + *poff = off; + inum = de.inum; + return iget(dp->dev, inum); + } + } + + return 0; +} + +// Write a new directory entry (name, inum) into the directory dp. +int +dirlink(struct inode *dp, char *name, uint inum) +{ + int off; + struct dirent de; + struct inode *ip; + + // Check that name is not present. + if((ip = dirlookup(dp, name, 0)) != 0){ + iput(ip); + return -1; + } + + // Look for an empty dirent. + for(off = 0; off < dp->size; off += sizeof(de)){ + if(readi(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de)) + panic("dirlink read"); + if(de.inum == 0) + break; + } + + strncpy(de.name, name, DIRSIZ); + de.inum = inum; + if(writei(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de)) + panic("dirlink"); + + return 0; +} + +//PAGEBREAK! +// Paths + +// Copy the next path element from path into name. +// Return a pointer to the element following the copied one. +// The returned path has no leading slashes, +// so the caller can check *path=='\0' to see if the name is the last one. +// If no name to remove, return 0. +// +// Examples: +// skipelem("a/bb/c", name) = "bb/c", setting name = "a" +// skipelem("///a//bb", name) = "bb", setting name = "a" +// skipelem("a", name) = "", setting name = "a" +// skipelem("", name) = skipelem("////", name) = 0 +// +static char* +skipelem(char *path, char *name) +{ + char *s; + int len; + + while(*path == '/') + path++; + if(*path == 0) + return 0; + s = path; + while(*path != '/' && *path != 0) + path++; + len = path - s; + if(len >= DIRSIZ) + memmove(name, s, DIRSIZ); + else { + memmove(name, s, len); + name[len] = 0; + } + while(*path == '/') + path++; + return path; +} + +// Look up and return the inode for a path name. +// If parent != 0, return the inode for the parent and copy the final +// path element into name, which must have room for DIRSIZ bytes. +// Must be called inside a transaction since it calls iput(). +static struct inode* +namex(char *path, int nameiparent, char *name) +{ + struct inode *ip, *next; + + if(*path == '/') + ip = iget(ROOTDEV, ROOTINO); + else + ip = idup(myproc()->cwd); + + while((path = skipelem(path, name)) != 0){ + ilock(ip); + if(ip->type != T_DIR){ + iunlockput(ip); + return 0; + } + if(nameiparent && *path == '\0'){ + // Stop one level early. + iunlock(ip); + return ip; + } + if((next = dirlookup(ip, name, 0)) == 0){ + iunlockput(ip); + return 0; + } + iunlockput(ip); + ip = next; + } + if(nameiparent){ + iput(ip); + return 0; + } + return ip; +} + +struct inode* +namei(char *path) +{ + char name[DIRSIZ]; + return namex(path, 0, name); +} + +struct inode* +nameiparent(char *path, char *name) +{ + return namex(path, 1, name); +} |