diff --git a/fs/Kconfig b/fs/Kconfig index 2694648..a428126 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -202,6 +202,14 @@ config EXT4DEV_FS_SECURITY If you are not using a security module that requires using extended attributes for file security labels, say N. +config TUX3 + tristate "Tux3 Versioning Filesystem" + help + To compile this filesystem as a module, choose M here: the module will + be called tux3. + + If unsure, say Maybe. + config JBD tristate help diff --git a/fs/Makefile b/fs/Makefile index 1e7a11b..c59a911 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -72,6 +72,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev obj-$(CONFIG_JBD) += jbd/ obj-$(CONFIG_JBD2) += jbd2/ obj-$(CONFIG_EXT2_FS) += ext2/ +obj-$(CONFIG_TUX3) += tux3/ obj-$(CONFIG_CRAMFS) += cramfs/ obj-y += ramfs/ obj-$(CONFIG_HUGETLBFS) += hugetlbfs/ diff --git a/fs/tux3/Makefile b/fs/tux3/Makefile new file mode 100644 index 0000000..bb4b66f --- /dev/null +++ b/fs/tux3/Makefile @@ -0,0 +1,13 @@ +ifeq ($(KERNELRELEASE),) +LINUX = /lib/modules/`uname -r`/build/ + +all: + make -C $(LINUX) M=`pwd` CONFIG_TUX3=m modules +clean: + make -C $(LINUX) M=`pwd` CONFIG_TUX3=m clean +else +obj-$(CONFIG_TUX3) += tux3.o +tux3-objs += balloc.o btree.o dir.o dleaf.o filemap.o hexdump.o iattr.o \ + ileaf.o namei.o inode.o super.o xattr.o +EXTRA_CFLAGS += -std=gnu99 -Wno-declaration-after-statement +endif diff --git a/fs/tux3/balloc.c b/fs/tux3/balloc.c new file mode 100644 index 0000000..acef84f --- /dev/null +++ b/fs/tux3/balloc.c @@ -0,0 +1,269 @@ +/* + * Block allocation + * + * Original copyright (c) 2008 Daniel Phillips + * Portions copyright (c) 2006-2008 Google Inc. + * Licensed under the GPL version 3 + * + * By contributing changes to this file you grant the original copyright holder + * the right to distribute those changes under any license. + */ + +#include "tux3.h" + +#ifndef trace +#define trace trace_off +#endif + +static void set_bits(u8 *bitmap, unsigned start, unsigned count) +{ + unsigned limit = start + count; + unsigned lmask = (-1 << (start & 7)) & 0xff; // little endian!!! + unsigned rmask = ~(-1 << (limit & 7)) & 0xff; // little endian!!! + unsigned loff = start >> 3, roff = limit >> 3; + if (loff == roff) { + bitmap[loff] |= lmask & rmask; + return; + } + bitmap[loff] |= lmask; + memset(bitmap + loff + 1, -1, roff - loff - 1); + if (rmask) + bitmap[roff] |= rmask; +} + +static void clear_bits(u8 *bitmap, unsigned start, unsigned count) +{ + unsigned limit = start + count; + unsigned lmask = (-1 << (start & 7)) & 0xff; // little endian!!! + unsigned rmask = ~(-1 << (limit & 7)) & 0xff; // little endian!!! + unsigned loff = start >> 3, roff = limit >> 3; + if (loff == roff) { + bitmap[loff] &= ~lmask | ~rmask; + return; + } + bitmap[loff] &= ~lmask; + memset(bitmap + loff + 1, 0, roff - loff - 1); + if (rmask) + bitmap[roff] &= ~rmask; +} + +static int all_set(u8 *bitmap, unsigned start, unsigned count) +{ + unsigned limit = start + count; + unsigned lmask = (-1 << (start & 7)) & 0xff; // little endian!!! + unsigned rmask = ~(-1 << (limit & 7)) & 0xff; // little endian!!! + unsigned loff = start >> 3, roff = limit >> 3; + if (loff == roff) { + unsigned mask = lmask & rmask; + return (bitmap[loff] & mask) == mask; + } + for (unsigned i = loff + 1; i < roff; i++) + if (bitmap[i] != 0xff) + return 0; + return (bitmap[loff] & lmask) == lmask && + (!rmask || (bitmap[roff] & rmask) == rmask); +} + +static int bytebits(unsigned char c) +{ + unsigned count = 0; + for (; c; c >>= 1) + count += c & 1; + return count; +} + +/* userland only */ +block_t count_range(struct inode *inode, block_t start, block_t count) +{ + assert(!(start & 7)); + unsigned char ones[256]; + for (int i = 0; i < sizeof(ones); i++) + ones[i] = bytebits(i); + + block_t limit = start + count; + unsigned blocksize = 1 << tux_sb(inode->i_sb)->blockbits; + unsigned mapshift = tux_sb(inode->i_sb)->blockbits + 3; + unsigned mapmask = (1 << mapshift) - 1; + unsigned blocks = (limit + mapmask) >> mapshift; + unsigned offset = (start & mapmask) >> 3; + block_t tail = (count + 7) >> 3, total = 0; + + for (unsigned block = start >> mapshift; block < blocks; block++) { + //printf("count block %x/%x\n", block, blocks); + struct buffer_head *buffer = blockread(mapping(inode), block); + if (!buffer) + return -1; + unsigned bytes = blocksize - offset; + if (bytes > tail) + bytes = tail; + unsigned char *p = bufdata(buffer) + offset, *top = p + bytes; + while (p < top) + total += ones[*p++]; + brelse(buffer); + tail -= bytes; + offset = 0; + } + return total; +} + +/* userland only */ +block_t bitmap_dump(struct inode *inode, block_t start, block_t count) +{ + block_t limit = start + count; + unsigned blocksize = 1 << tux_sb(inode->i_sb)->blockbits; + unsigned mapshift = tux_sb(inode->i_sb)->blockbits + 3; + unsigned mapmask = (1 << mapshift) - 1; + unsigned blocks = (limit + mapmask) >> mapshift, active = 0; + unsigned offset = (start & mapmask) >> 3; + unsigned startbit = start & 7; + block_t tail = (count + startbit + 7) >> 3, begin = -1; + + printf("%i bitmap blocks:\n", blocks); + for (unsigned block = start >> mapshift; block < blocks; block++) { + int ended = 0, any = 0; + struct buffer_head *buffer = blockread(mapping(inode), block); + if (!buffer) + return -1; + unsigned bytes = blocksize - offset; + if (bytes > tail) + bytes = tail; + unsigned char *p = bufdata(buffer) + offset, *top = p + bytes; + for (; p < top; p++, startbit = 0) { + unsigned c = *p; + if (!any && c) + printf("[%x] ", block); + any |= c; + if ((!c && begin < 0) || (c == 0xff && begin >= 0)) + continue; + for (int i = startbit, mask = 1 << startbit; i < 8; i++, mask <<= 1) { + if (!(c & mask) == (begin < 0)) + continue; + block_t found = i + (((void *)p - bufdata(buffer)) << 3) + (block << mapshift); + if (begin < 0) + begin = found; + else { + if ((begin >> mapshift) != block) + printf("-%Lx ", (L)(found - 1)); + else if (begin == found - 1) + printf("%Lx ", (L)begin); + else + printf("%Lx-%Lx ", (L)begin, (L)(found - 1)); + begin = -1; + ended++; + } + } + } + active += !!any; + brelse(buffer); + tail -= bytes; + offset = 0; + if (begin >= 0) + printf("%Lx-", (L)begin); + if (any) + printf("\n"); + } + printf("(%i active)\n", active); + return -1; +} + +static block_t balloc_from_range(struct inode *inode, block_t start, unsigned count, unsigned blocks) +{ + trace("balloc %i blocks from [%Lx/%Lx]", blocks, (L)start, (L)count); + assert(blocks > 0); + block_t limit = start + count; + unsigned blocksize = 1 << tux_sb(inode->i_sb)->blockbits; + unsigned mapshift = tux_sb(inode->i_sb)->blockbits + 3; + unsigned mapmask = (1 << mapshift) - 1; + unsigned mapblocks = (limit + mapmask) >> mapshift; + unsigned offset = (start & mapmask) >> 3; + unsigned startbit = start & 7; + block_t tail = (count + startbit + 7) >> 3; + for (unsigned mapblock = start >> mapshift; mapblock < mapblocks; mapblock++) { + trace_off("search mapblock %x/%x", mapblock, mapblocks); + struct buffer_head *buffer = blockread(mapping(inode), mapblock); + if (!buffer) + return -1; + unsigned bytes = blocksize - offset, run = 0; + if (bytes > tail) + bytes = tail; + unsigned char *p = bufdata(buffer) + offset, *top = p + bytes, c; + for (; p < top; p++, startbit = 0) { + if ((c = *p) == 0xff) { + run = 0; + continue; + } + for (int i = startbit, mask = 1 << startbit; i < 8; i++, mask <<= 1) { + if ((c & mask)) { + run = 0; + continue; + } + if (++run < blocks) + continue; + assert(run == blocks); + block_t found = i + (((void *)p - bufdata(buffer)) << 3) + (mapblock << mapshift); + if (found >= limit) { + assert(mapblock == mapblocks - 1); + goto final_partial_byte; + } + found -= run - 1; + set_bits(bufdata(buffer), found & mapmask, run); + brelse_dirty(buffer); + tux_sb(inode->i_sb)->nextalloc = found + run; + tux_sb(inode->i_sb)->freeblocks -= run; + //set_sb_dirty(sb); + return found; + } + } +final_partial_byte: + brelse(buffer); + tail -= bytes; + offset = 0; + } + return -1; +} + +block_t balloc(struct sb *sb, unsigned blocks) +{ + assert(blocks > 0); + trace_off("balloc %x blocks at goal %Lx", blocks, (L)sb->nextalloc); + mutex_lock(&sb->bitmap->i_mutex); + block_t goal = sb->nextalloc, total = sb->volblocks, block; + if ((block = balloc_from_range(sb->bitmap, goal, total - goal, blocks)) >= 0) + goto found; + if ((block = balloc_from_range(sb->bitmap, 0, goal, blocks)) >= 0) + goto found; + mutex_unlock(&sb->bitmap->i_mutex); + return -1; +found: + printf("balloc extent -> [%Lx/%x]\n", (L)block, blocks); + mutex_unlock(&sb->bitmap->i_mutex); + return block; +} + +void bfree(struct sb *sb, block_t start, unsigned blocks) +{ + assert(blocks > 0); + unsigned mapshift = sb->blockbits + 3; + unsigned mapmask = (1 << mapshift) - 1; + unsigned mapblock = start >> mapshift; + char *why = "could not read bitmap buffer"; + mutex_lock(&sb->bitmap->i_mutex); + struct buffer_head *buffer = blockread(mapping(sb->bitmap), mapblock); + printf("free <- [%Lx]\n", (L)start); + if (!buffer) + goto eek; + if (!all_set(bufdata(buffer), start &= mapmask, blocks)) + goto eeek; + clear_bits(bufdata(buffer), start, blocks); + brelse_dirty(buffer); + sb->freeblocks += blocks; + //set_sb_dirty(sb); + mutex_unlock(&sb->bitmap->i_mutex); + return; +eeek: + why = "blocks already free"; + brelse(buffer); +eek: + warn("extent 0x%Lx %s!\n", (L)start, why); + mutex_unlock(&sb->bitmap->i_mutex); +} diff --git a/fs/tux3/btree.c b/fs/tux3/btree.c new file mode 100644 index 0000000..de363e5 --- /dev/null +++ b/fs/tux3/btree.c @@ -0,0 +1,691 @@ +/* + * Generic btree operations + * + * Original copyright (c) 2008 Daniel Phillips + * Portions copyright (c) 2006-2008 Google Inc. + * Licensed under the GPL version 3 + * + * By contributing changes to this file you grant the original copyright holder + * the right to distribute those changes under any license. + */ + +#include "tux3.h" + +#ifndef trace +#define trace trace_off +#endif + +struct bnode +{ + be_u32 count, unused; + struct index_entry { be_u64 key; be_u64 block; } PACKED entries[]; +} PACKED; +/* + * Note that the first key of an index block is never accessed. This is + * because for a btree, there is always one more key than nodes in each + * index node. In other words, keys lie between node pointers. I + * micro-optimize by placing the node count in the first key, which allows + * a node to contain an esthetically pleasing binary number of pointers. + * (Not done yet.) + */ + +static inline unsigned bcount(struct bnode *node) +{ + return from_be_u32(node->count); +} + +// desperately need ERR_PTR return here to distinguish between +// ENOMEM, which should be impossible but when it happens we +// need to do something reasonable, or ENOSPC which we must +// just report and keep going without a fuss. +static struct buffer_head *new_block(struct btree *btree) +{ + block_t block = (btree->ops->balloc)(btree->sb, 1); + if (block == -1) + return NULL; + struct buffer_head *buffer = sb_getblk(vfs_sb(btree->sb), block); + if (!buffer) + return NULL; + memset(bufdata(buffer), 0, bufsize(buffer)); + /* FIXME */ + set_buffer_uptodate(buffer); + return buffer; +} + +struct buffer_head *new_leaf(struct btree *btree) +{ + struct buffer_head *buffer = new_block(btree); + if (buffer) + (btree->ops->leaf_init)(btree, bufdata(buffer)); + return buffer; +} + +static struct buffer_head *new_node(struct btree *btree) +{ + struct buffer_head *buffer = new_block(btree); + if (buffer) + ((struct bnode *)bufdata(buffer))->count = 0; + return buffer; +} + +/* + * A btree cursor has n + 1 entries for a btree of depth n, with the first n + * entries pointing at internal nodes and entry n + 1 pointing at a leaf. + * The next field points at the next index entry that will be loaded in a left + * to right tree traversal, not the current entry. The next pointer is null + * for the leaf, which has its own specialized traversal algorithms. + */ + +static inline struct bnode *cursor_node(struct cursor *cursor, int level) +{ + return bufdata(cursor->path[level].buffer); +} + +struct buffer_head *cursor_leafbuf(struct cursor *cursor) +{ + assert(cursor->len >= 2); /* root-bnode + leaf >= 2 */ + return cursor->path[cursor->len - 1].buffer; +} + +static void level_root_add(struct cursor *cursor, struct buffer_head *buffer, + struct index_entry *next) +{ +#ifdef CURSOR_DEBUG + assert(cursor->len < cursor->maxlen); + assert(cursor->path[cursor->len].buffer == FREE_BUFFER); + assert(cursor->path[cursor->len].next == FREE_NEXT); +#endif + vecmove(cursor->path + 1, cursor->path, cursor->len); + cursor->len++; + cursor->path[0].buffer = buffer; + cursor->path[0].next = next; +} + +static void level_replace_brelse(struct cursor *cursor, int level, struct buffer_head *buffer, struct index_entry *next) +{ +#ifdef CURSOR_DEBUG + assert(buffer); + assert(level < cursor->len); + assert(cursor->path[level].buffer != FREE_BUFFER); + assert(cursor->path[level].next != FREE_NEXT); +#endif + brelse(cursor->path[level].buffer); + cursor->path[level].buffer = buffer; + cursor->path[level].next = next; +} + +void level_push(struct cursor *cursor, struct buffer_head *buffer, struct index_entry *next) +{ +#ifdef CURSOR_DEBUG + assert(cursor->len < cursor->maxlen); + assert(cursor->path[cursor->len].buffer == FREE_BUFFER); + assert(cursor->path[cursor->len].next == FREE_NEXT); +#endif + cursor->path[cursor->len].buffer = buffer; + cursor->path[cursor->len].next = next; + cursor->len++; +} + +static struct buffer_head *level_pop(struct cursor *cursor) +{ + struct buffer_head *buffer; +#ifdef CURSOR_DEBUG + assert(cursor->len > 0); +#endif + cursor->len--; + buffer = cursor->path[cursor->len].buffer; +#ifdef CURSOR_DEBUG + cursor->path[cursor->len].buffer = FREE_BUFFER; + cursor->path[cursor->len].next = FREE_NEXT; +#endif + return buffer; +} + +static void level_pop_brelse(struct cursor *cursor) +{ + brelse(level_pop(cursor)); +} + +void level_pop_brelse_dirty(struct cursor *cursor) +{ + brelse_dirty(level_pop(cursor)); +} + +static inline int level_finished(struct cursor *cursor, int level) +{ + struct bnode *node = cursor_node(cursor, level); + return cursor->path[level].next == node->entries + bcount(node); +} +// also write level_beginning!!! + +void release_cursor(struct cursor *cursor) +{ + while (cursor->len) + level_pop_brelse(cursor); +} + +/* unused */ +void show_cursor(struct cursor *cursor, int depth) +{ + printf(">>> cursor %p/%i:", cursor, depth); + for (int i = 0; i < depth; i++) + printf(" [%Lx/%i]", (L)bufindex(cursor->path[i].buffer), bufcount(cursor->path[i].buffer)); + printf("\n"); +} + +static void cursor_check(struct cursor *cursor) +{ + if (cursor->len == 0) + return; + tuxkey_t key = 0; + block_t block = cursor->btree->root.block; + for (int i = 0; i < cursor->len; i++) { + assert(bufindex(cursor->path[i].buffer) == block); + if (!cursor->path[i].next) + break; + struct bnode *node = cursor_node(cursor, i); + assert(node->entries < cursor->path[i].next); + assert(cursor->path[i].next <= node->entries + bcount(node)); + assert(from_be_u64((cursor->path[i].next - 1)->key) >= key); + block = from_be_u64((cursor->path[i].next - 1)->block); + key = from_be_u64((cursor->path[i].next - 1)->key); + } +} + +static inline int alloc_cursor_size(int maxlevel) +{ + return sizeof(struct cursor) + sizeof(struct path_level) * maxlevel; +} + +struct cursor *alloc_cursor(struct btree *btree, int extra) +{ + int maxlevel = btree->root.depth + 1 + extra; + struct cursor *cursor = malloc(alloc_cursor_size(maxlevel)); + if (cursor) { + cursor->btree = btree; + cursor->len = 0; +#ifdef CURSOR_DEBUG + cursor->maxlen = maxlevel; + for (int i = 0; i < maxlevel; i++) { + cursor->path[i].buffer = FREE_BUFFER; /* for debug */ + cursor->path[i].next = FREE_NEXT; /* for debug */ + } +#endif + } + return cursor; +} + +void free_cursor(struct cursor *cursor) +{ +#ifdef CURSOR_DEBUG + assert(cursor->len == 0); +#endif + free(cursor); +} + +int probe(struct btree *btree, tuxkey_t key, struct cursor *cursor) +{ + unsigned i, depth = btree->root.depth; + struct buffer_head *buffer = sb_bread(vfs_sb(btree->sb), btree->root.block); + if (!buffer) + return -EIO; + struct bnode *node = bufdata(buffer); + + for (i = 0; i < depth; i++) { + struct index_entry *next = node->entries, *top = next + bcount(node); + while (++next < top) /* binary search goes here */ + if (from_be_u64(next->key) > key) + break; + //printf("probe level %i, %ti of %i\n", i, next - node->entries, bcount(node)); + level_push(cursor, buffer, next); + if (!(buffer = sb_bread(vfs_sb(btree->sb), from_be_u64((next - 1)->block)))) + goto eek; + node = (struct bnode *)bufdata(buffer); + } + assert((btree->ops->leaf_sniff)(btree, bufdata(buffer))); + level_push(cursor, buffer, NULL); + cursor_check(cursor); + return 0; +eek: + release_cursor(cursor); + return -EIO; /* stupid, it might have been NOMEM */ +} + +int advance(struct btree *btree, struct cursor *cursor) +{ + int depth = btree->root.depth, level = depth; + struct buffer_head *buffer; + do { + level_pop_brelse(cursor); + if (!level) + return 0; + level--; + } while (level_finished(cursor, level)); + while (1) { + buffer = sb_bread(vfs_sb(btree->sb), from_be_u64(cursor->path[level].next->block)); + if (!buffer) + goto eek; + cursor->path[level].next++; + if (level + 1 == depth) + break; + level_push(cursor, buffer, ((struct bnode *)bufdata(buffer))->entries); + level++; + } + level_push(cursor, buffer, NULL); + cursor_check(cursor); + return 1; +eek: + release_cursor(cursor); + return -EIO; +} + +/* + * Climb up the cursor until we find the first level where we have not yet read + * all the way to the end of the index block, there we find the key that + * separates the subtree we are in (a leaf) from the next subtree to the right. + */ +static be_u64 *next_keyp(struct cursor *cursor, int depth) +{ + for (int level = depth; level--;) + if (!level_finished(cursor, level)) + return &cursor->path[level].next->key; + return NULL; +} + +tuxkey_t next_key(struct cursor *cursor, int depth) +{ + be_u64 *keyp = next_keyp(cursor, depth); + return keyp ? from_be_u64(*keyp) : -1; +} +// also write this_key!!! + +void show_tree_range(struct btree *btree, tuxkey_t start, unsigned count) +{ + printf("%i level btree at %Li:\n", btree->root.depth, (L)btree->root.block); + struct cursor *cursor = alloc_cursor(btree, 0); + if (!cursor) + error("out of memory"); + if (probe(btree, start, cursor)) + error("tell me why!!!"); + struct buffer_head *buffer; + do { + buffer = cursor_leafbuf(cursor); + assert((btree->ops->leaf_sniff)(btree, bufdata(buffer))); + (btree->ops->leaf_dump)(btree, bufdata(buffer)); + //tuxkey_t *next = pnext_key(cursor, btree->depth); + //printf("next key = %Lx:\n", next ? (L)*next : 0); + } while (--count && advance(btree, cursor)); + free_cursor(cursor); +} + +void show_tree(struct btree *btree) +{ + show_tree_range(btree, 0, -1); +} + +/* Deletion */ + +static void remove_index(struct cursor *cursor, int level) +{ + struct bnode *node = cursor_node(cursor, level); + int count = bcount(node), i; + + /* stomps the node count (if 0th key holds count) */ + memmove(cursor->path[level].next - 1, cursor->path[level].next, + (char *)&node->entries[count] - (char *)cursor->path[level].next); + node->count = to_be_u32(count - 1); + --(cursor->path[level].next); + mark_buffer_dirty(cursor->path[level].buffer); + + /* no separator for last entry */ + if (level_finished(cursor, level)) + return; + /* + * Climb up to common parent and set separating key to deleted key. + * What if index is now empty? (no deleted key) + * Then some key above is going to be deleted and used to set sep + * Climb the cursor while at first entry, bail out at root + * find the node with the old sep, set it to deleted key + */ + if (cursor->path[level].next == node->entries && level) { + be_u64 sep = (cursor->path[level].next)->key; + for (i = level - 1; cursor->path[i].next - 1 == cursor_node(cursor, i)->entries; i--) + if (!i) + return; + (cursor->path[i].next - 1)->key = sep; + mark_buffer_dirty(cursor->path[i].buffer); + } +} + +static void merge_nodes(struct bnode *node, struct bnode *node2) +{ + veccopy(&node->entries[bcount(node)], node2->entries, bcount(node2)); + node->count = to_be_u32(bcount(node) + bcount(node2)); +} + +static void brelse_free(struct btree *btree, struct buffer_head *buffer) +{ + struct sb *sb = btree->sb; + block_t block = bufindex(buffer); + if (bufcount(buffer) != 1) { + warn("free block %Lx still in use!", (L)bufindex(buffer)); + brelse(buffer); + assert(bufcount(buffer) == 0); + return; + } + brelse(buffer); + (btree->ops->bfree)(sb, block, 1); + set_buffer_empty(buffer); // free it!!! (and need a buffer free state) +} + +int tree_chop(struct btree *btree, struct delete_info *info, millisecond_t deadline) +{ + int depth = btree->root.depth, level = depth - 1, suspend = 0; + struct cursor *cursor; + struct buffer_head *leafbuf, **prev, *leafprev = NULL; + struct btree_ops *ops = btree->ops; + struct sb *sb = btree->sb; + int ret; + + cursor = alloc_cursor(btree, 0); + prev = malloc(sizeof(*prev) * depth); + memset(prev, 0, sizeof(*prev) * depth); + + down_write(&btree->lock); + probe(btree, info->resume, cursor); + leafbuf = level_pop(cursor); + + /* leaf walk */ + while (1) { + ret = (ops->leaf_chop)(btree, info->key, bufdata(leafbuf)); + if (ret) { + mark_buffer_dirty(leafbuf); + if (ret < 0) + goto out; + } + + /* try to merge this leaf with prev */ + if (leafprev) { + struct vleaf *this = bufdata(leafbuf); + struct vleaf *that = bufdata(leafprev); + /* try to merge leaf with prev */ + if ((ops->leaf_need)(btree, this) <= (ops->leaf_free)(btree, that)) { + trace(">>> can merge leaf %p into leaf %p", leafbuf, leafprev); + (ops->leaf_merge)(btree, that, this); + remove_index(cursor, level); + mark_buffer_dirty(leafprev); + brelse_free(btree, leafbuf); + //dirty_buffer_count_check(sb); + goto keep_prev_leaf; + } + brelse(leafprev); + } + leafprev = leafbuf; +keep_prev_leaf: + + //nanosleep(&(struct timespec){ 0, 50 * 1000000 }, NULL); + //printf("time remaining: %Lx\n", deadline - gettime()); +// if (deadline && gettime() > deadline) +// suspend = -1; + if (info->blocks && info->freed >= info->blocks) + suspend = -1; + + /* pop and try to merge finished nodes */ + while (suspend || level_finished(cursor, level)) { + /* try to merge node with prev */ + if (prev[level]) { + assert(level); /* node has no prev */ + struct bnode *this = cursor_node(cursor, level); + struct bnode *that = bufdata(prev[level]); + trace_off("check node %p against %p", this, that); + trace_off("this count = %i prev count = %i", bcount(this), bcount(that)); + /* try to merge with node to left */ + if (bcount(this) <= sb->entries_per_node - bcount(that)) { + trace(">>> can merge node %p into node %p", this, that); + merge_nodes(that, this); + remove_index(cursor, level - 1); + mark_buffer_dirty(prev[level]); + brelse_free(btree, level_pop(cursor)); + //dirty_buffer_count_check(sb); + goto keep_prev_node; + } + brelse(prev[level]); + } + prev[level] = level_pop(cursor); +keep_prev_node: + + /* deepest key in the cursor is the resume address */ + if (suspend == -1 && !level_finished(cursor, level)) { + suspend = 1; /* only set resume once */ + info->resume = from_be_u64((cursor->path[level].next)->key); + } + if (!level) { /* remove depth if possible */ + while (depth > 1 && bcount(bufdata(prev[0])) == 1) { + trace("drop btree level"); + btree->root.block = bufindex(prev[1]); + brelse_free(btree, prev[0]); + //dirty_buffer_count_check(sb); + depth = --btree->root.depth; + vecmove(prev, prev + 1, depth); + //set_sb_dirty(sb); + } + //sb->snapmask &= ~snapmask; delete_snapshot_from_disk(); + //set_sb_dirty(sb); + //save_sb(sb); + ret = suspend; + goto out; + } + level--; + trace_off(printf("pop to level %i, block %Lx, %i of %i nodes\n", level, bufindex(cursor->path[level].buffer), cursor->path[level].next - cursor_node(cursor, level)->entries, bcount(cursor_node(cursor, level)));); + } + + /* push back down to leaf level */ + while (level < depth - 1) { + struct buffer_head *buffer = sb_bread(vfs_sb(sb), from_be_u64(cursor->path[level++].next++->block)); + if (!buffer) { + ret = -EIO; + goto out; + } + level_push(cursor, buffer, ((struct bnode *)bufdata(buffer))->entries); + trace_off(printf("push to level %i, block %Lx, %i nodes\n", level, bufindex(buffer), bcount(cursor_node(cursor, level)));); + } + //dirty_buffer_count_check(sb); + /* go to next leaf */ + if (!(leafbuf = sb_bread(vfs_sb(sb), from_be_u64(cursor->path[level].next++->block)))) { + ret = -EIO; + goto out; + } + } + +out: + if (leafprev) + brelse(leafprev); + for (int i = 0; i < btree->root.depth; i++) { + if (prev[i]) + brelse(prev[i]); + } + free(prev); + release_cursor(cursor); + up_write(&btree->lock); + free_cursor(cursor); + return ret; +} + +/* Insertion */ + +static void add_child(struct bnode *node, struct index_entry *p, block_t child, u64 childkey) +{ + vecmove(p + 1, p, node->entries + bcount(node) - p); + p->block = to_be_u64(child); + p->key = to_be_u64(childkey); + node->count = to_be_u32(bcount(node) + 1); +} + +/* + * Insert new leaf to next cursor position. + * keep == 1: keep current cursor position. + * keep == 0, set cursor position to new leaf. + */ +static int insert_leaf(struct cursor *cursor, tuxkey_t childkey, struct buffer_head *leafbuf, int keep) +{ + struct btree *btree = cursor->btree; + int depth = btree->root.depth; + block_t childblock = bufindex(leafbuf); + if (keep) + brelse(leafbuf); + else { + level_pop_brelse(cursor); + level_push(cursor, leafbuf, NULL); + } + while (depth--) { + struct path_level *at = cursor->path + depth; + struct buffer_head *parentbuf = at->buffer; + struct bnode *parent = bufdata(parentbuf); + + /* insert and exit if not full */ + if (bcount(parent) < btree->sb->entries_per_node) { + add_child(parent, at->next, childblock, childkey); + if (!keep) + at->next++; + mark_buffer_dirty(parentbuf); + return 0; + } + + /* split a full index node */ + struct buffer_head *newbuf = new_node(btree); + if (!newbuf) + goto eek; + struct bnode *newnode = bufdata(newbuf); + unsigned half = bcount(parent) / 2; + u64 newkey = from_be_u64(parent->entries[half].key); + newnode->count = to_be_u32(bcount(parent) - half); + memcpy(&newnode->entries[0], &parent->entries[half], bcount(newnode) * sizeof(struct index_entry)); + parent->count = to_be_u32(half); + + /* if the cursor is in the new node, use that as the parent */ + if (at->next > parent->entries + half) { + struct index_entry *newnext; + mark_buffer_dirty(parentbuf); + newnext = newnode->entries + (at->next - &parent->entries[half]); + get_bh(newbuf); + level_replace_brelse(cursor, depth, newbuf, newnext); + parentbuf = newbuf; + parent = newnode; + } else + mark_buffer_dirty(newbuf); + add_child(parent, at->next, childblock, childkey); + if (!keep) + at->next++; + mark_buffer_dirty(parentbuf); + childkey = newkey; + childblock = bufindex(newbuf); + brelse(newbuf); + } + trace("add tree level"); + struct buffer_head *newbuf = new_node(btree); + if (!newbuf) + goto eek; + struct bnode *newroot = bufdata(newbuf); + int left_node = bufindex(cursor->path[0].buffer) != childblock; + newroot->count = to_be_u32(2); + newroot->entries[0].block = to_be_u64(btree->root.block); + newroot->entries[1].key = to_be_u64(childkey); + newroot->entries[1].block = to_be_u64(childblock); + btree->root.block = bufindex(newbuf); + btree->root.depth++; + level_root_add(cursor, newbuf, newroot->entries + 1 + !left_node); + //set_sb_dirty(sb); + mark_buffer_dirty(newbuf); + cursor_check(cursor); + return 0; +eek: + release_cursor(cursor); + return -ENOMEM; +} + +/* Insert new leaf to next cursor position, then set cursor to new leaf */ +int btree_insert_leaf(struct cursor *cursor, tuxkey_t key, struct buffer_head *leafbuf) +{ + return insert_leaf(cursor, key, leafbuf, 0); +} + +int btree_leaf_split(struct btree *btree, struct cursor *cursor, tuxkey_t key) +{ + trace("split leaf"); + struct buffer_head *newbuf = new_leaf(btree); + if (!newbuf) { + /* the rule: release cursor at point of error */ + release_cursor(cursor); + return -ENOMEM; + } + struct buffer_head *leafbuf = cursor_leafbuf(cursor); + tuxkey_t newkey = (btree->ops->leaf_split)(btree, key, bufdata(leafbuf), bufdata(newbuf)); + int keep; + trace_off("use upper? %Li %Li", key, newkey); + if (key >= newkey) { + mark_buffer_dirty(leafbuf); + keep = 0; + } else { + mark_buffer_dirty(newbuf); + keep = 1; + } + return insert_leaf(cursor, newkey, newbuf, keep); +} + +void *tree_expand(struct btree *btree, tuxkey_t key, unsigned newsize, struct cursor *cursor) +{ + for (int i = 0; i < 2; i++) { + struct buffer_head *leafbuf = cursor_leafbuf(cursor); + void *space = (btree->ops->leaf_resize)(btree, key, bufdata(leafbuf), newsize); + if (space) + return space; + assert(!i); + int err = btree_leaf_split(btree, cursor, key); + if (err) { + warn("insert_node failed (%d)", err); + break; + } + } + return NULL; +} + +void init_btree(struct btree *btree, struct sb *sb, struct root root, struct btree_ops *ops) +{ + btree->sb = sb; + btree->ops = ops; + btree->root = root; + init_rwsem(&btree->lock); + ops->btree_init(btree); +} + +int new_btree(struct btree *btree, struct sb *sb, struct btree_ops *ops) +{ + /* Initialize btree with dummy root */ + init_btree(btree, sb, (struct root){}, ops); + + struct buffer_head *rootbuf = new_node(btree); + struct buffer_head *leafbuf = new_leaf(btree); + if (!rootbuf || !leafbuf) + goto eek; + struct bnode *rootnode = bufdata(rootbuf); + rootnode->entries[0].block = to_be_u64(bufindex(leafbuf)); + rootnode->count = to_be_u32(1); + printf("root at %Lx\n", (L)bufindex(rootbuf)); + printf("leaf at %Lx\n", (L)bufindex(leafbuf)); + brelse_dirty(rootbuf); + brelse_dirty(leafbuf); + btree->root = (struct root){ .block = bufindex(rootbuf), .depth = 1 }; + return 0; +eek: + if (rootbuf) + brelse(rootbuf); + if (leafbuf) + brelse(leafbuf); + return -ENOMEM; +} + +/* userland only */ +void free_btree(struct btree *btree) +{ + // write me +} diff --git a/fs/tux3/dir.c b/fs/tux3/dir.c new file mode 100644 index 0000000..759f9f9 --- /dev/null +++ b/fs/tux3/dir.c @@ -0,0 +1,346 @@ +/* Lifted from Ext2, blush. GPLv2. Portions (c) Daniel Phillips 2008 */ + +/* + * linux/include/linux/ext2_fs.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * linux/fs/ext2/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * All code that works with directory layout had been switched to pagecache + * and moved here. AV + */ + +#include "tux3.h" + +#define TUX_DIR_PAD (sizeof(inum_t) - 1) +#define TUX_DIR_HEAD (offsetof(tux_dirent, name)) +#define TUX_REC_LEN(name_len) (((name_len) + TUX_DIR_HEAD + TUX_DIR_PAD) & ~TUX_DIR_PAD) +#define TUX_MAX_REC_LEN ((1<<16)-1) + +static inline unsigned tux_rec_len_from_disk(be_u16 dlen) +{ + unsigned len = from_be_u16(dlen); + if (len == TUX_MAX_REC_LEN) + return 1 << 16; + return len; +} + +static inline be_u16 tux_rec_len_to_disk(unsigned len) +{ + if (len == (1 << 16)) + return to_be_u16(TUX_MAX_REC_LEN); + else if (len > (1 << 16)) + error("oops"); + return to_be_u16(len); +} + +static inline int is_deleted(tux_dirent *entry) +{ + return !entry->name_len; /* ext2 uses !inum for this */ +} + +static inline int tux_match(tux_dirent *entry, const char *const name, int len) +{ + if (len != entry->name_len) + return 0; + if (is_deleted(entry)) + return 0; + return !memcmp(name, entry->name, len); +} + +static inline tux_dirent *next_entry(tux_dirent *entry) +{ + return (tux_dirent *)((char *)entry + tux_rec_len_from_disk(entry->rec_len)); +} + +enum { + TUX_UNKNOWN, + TUX_REG, + TUX_DIR, + TUX_CHR, + TUX_BLK, + TUX_FIFO, + TUX_SOCK, + TUX_LNK, + TUX_TYPES +}; + +#define STAT_SHIFT 12 + +static unsigned char tux_type_by_mode[S_IFMT >> STAT_SHIFT] = { + [S_IFREG >> STAT_SHIFT] = TUX_REG, + [S_IFDIR >> STAT_SHIFT] = TUX_DIR, + [S_IFCHR >> STAT_SHIFT] = TUX_CHR, + [S_IFBLK >> STAT_SHIFT] = TUX_BLK, + [S_IFIFO >> STAT_SHIFT] = TUX_FIFO, + [S_IFSOCK >> STAT_SHIFT] = TUX_SOCK, + [S_IFLNK >> STAT_SHIFT] = TUX_LNK, +}; + +int tux_update_entry(struct buffer_head *buffer, tux_dirent *entry, inum_t inum, unsigned mode) +{ + struct inode *dir = buffer_inode(buffer); + entry->inum = to_be_u64(inum); + entry->type = tux_type_by_mode[(mode & S_IFMT) >> STAT_SHIFT]; + brelse_dirty(buffer); + dir->i_mtime = dir->i_ctime = gettime(); + mark_inode_dirty(dir); + return 0; +} + +loff_t _tux_create_entry(struct inode *dir, const char *name, int len, inum_t inum, unsigned mode, loff_t *size) +{ + tux_dirent *entry; + struct buffer_head *buffer; + unsigned reclen = TUX_REC_LEN(len), rec_len, name_len, offset; + unsigned blockbits = tux_sb(dir->i_sb)->blockbits, blocksize = 1 << blockbits; + unsigned blocks = *size >> blockbits, block; + for (block = 0; block < blocks; block++) { + buffer = blockread(mapping(dir), block); + if (!buffer) + return -EIO; + entry = bufdata(buffer); + tux_dirent *limit = bufdata(buffer) + blocksize - reclen; + while (entry <= limit) { + if (entry->rec_len == 0) { + brelse(buffer); + tux_error(dir->i_sb, "zero-length directory entry"); + return -EIO; + } + name_len = TUX_REC_LEN(entry->name_len); + rec_len = tux_rec_len_from_disk(entry->rec_len); + if (is_deleted(entry) && rec_len >= reclen) + goto create; + if (rec_len >= name_len + reclen) + goto create; + entry = (tux_dirent *)((char *)entry + rec_len); + } + brelse(buffer); + } + buffer = blockget(mapping(dir), block = blocks); + entry = bufdata(buffer); + name_len = 0; + rec_len = blocksize; + *entry = (tux_dirent){ .rec_len = tux_rec_len_to_disk(blocksize) }; + *size += blocksize; +create: + if (!is_deleted(entry)) { + tux_dirent *newent = (tux_dirent *)((char *)entry + name_len); + newent->rec_len = tux_rec_len_to_disk(rec_len - name_len); + entry->rec_len = tux_rec_len_to_disk(name_len); + entry = newent; + } + entry->name_len = len; + memcpy(entry->name, name, len); + offset = (void *)entry - bufdata(buffer); + int err = tux_update_entry(buffer, entry, inum, mode); + if (err) + return err; + return (block << blockbits) + offset; /* only needed for xattr create */ +} + +tux_dirent *_tux_find_entry(struct inode *dir, const char *name, int len, struct buffer_head **result, loff_t size) +{ + unsigned reclen = TUX_REC_LEN(len); + unsigned blocksize = 1 << tux_sb(dir->i_sb)->blockbits; + unsigned blocks = size >> tux_sb(dir->i_sb)->blockbits, block; + int err = -ENOENT; + for (block = 0; block < blocks; block++) { + struct buffer_head *buffer = blockread(mapping(dir), block); + if (!buffer) { + err = -EIO; // need ERR_PTR for blockread!!! + goto error; + } + tux_dirent *entry = bufdata(buffer); + tux_dirent *limit = (void *)entry + blocksize - reclen; + while (entry <= limit) { + if (entry->rec_len == 0) { + brelse(buffer); + tux_error(dir->i_sb, "zero length entry at <%Lx:%x>", (L)tux_inode(dir)->inum, block); + err = -EIO; + goto error; + } + if (tux_match(entry, name, len)) { + *result = buffer; + return entry; + } + entry = next_entry(entry); + } + brelse(buffer); + } +error: + *result = NULL; /* for debug */ + return ERR_PTR(err); +} + +loff_t tux_create_entry(struct inode *dir, const char *name, int len, inum_t inum, unsigned mode) +{ + return _tux_create_entry(dir, name, len, inum, mode, &dir->i_size); +} + +tux_dirent *tux_find_entry(struct inode *dir, const char *name, int len, struct buffer_head **result) +{ + return _tux_find_entry(dir, name, len, result, dir->i_size); +} + +static unsigned char filetype[TUX_TYPES] = { + [TUX_UNKNOWN] = DT_UNKNOWN, + [TUX_REG] = DT_REG, + [TUX_DIR] = DT_DIR, + [TUX_CHR] = DT_CHR, + [TUX_BLK] = DT_BLK, + [TUX_FIFO] = DT_FIFO, + [TUX_SOCK] = DT_SOCK, + [TUX_LNK] = DT_LNK, +}; + +int tux_readdir(struct file *file, void *state, filldir_t filldir) +{ + loff_t pos = file->f_pos; +#ifdef __KERNEL__ + struct inode *dir = file->f_dentry->d_inode; +#else + struct inode *dir = file->f_inode; +#endif + int revalidate = file->f_version != dir->i_version; + unsigned blockbits = tux_sb(dir->i_sb)->blockbits; + unsigned blocksize = 1 << blockbits; + unsigned blockmask = blocksize - 1; + unsigned blocks = dir->i_size >> blockbits; + unsigned offset = pos & blockmask; + for (unsigned block = pos >> blockbits ; block < blocks; block++) { + struct buffer_head *buffer = blockread(mapping(dir), block); + if (!buffer) + return -EIO; + void *base = bufdata(buffer); + if (revalidate) { + if (offset) { + tux_dirent *entry = base + offset; + tux_dirent *p = base + (offset & blockmask); + while (p < entry && p->rec_len) + p = next_entry(p); + offset = (void *)p - base; + file->f_pos = (block << blockbits) + offset; + } + file->f_version = dir->i_version; + revalidate = 0; + } + unsigned size = dir->i_size - (block << blockbits); + tux_dirent *limit = base + (size > blocksize ? blocksize : size) - TUX_REC_LEN(1); + for (tux_dirent *entry = base + offset; entry <= limit; entry = next_entry(entry)) { + if (entry->rec_len == 0) { + brelse(buffer); + tux_error(dir->i_sb, "zero length entry at <%Lx:%x>", (L)tux_inode(dir)->inum, block); + return -EIO; + } + if (!is_deleted(entry)) { + unsigned type = (entry->type < TUX_TYPES) ? filetype[entry->type] : DT_UNKNOWN; + int lame = filldir( + state, entry->name, entry->name_len, + (block << blockbits) | ((void *)entry - base), + from_be_u64(entry->inum), type); + if (lame) { + brelse(buffer); + return 0; + } + } + file->f_pos += tux_rec_len_from_disk(entry->rec_len); + } + brelse(buffer); + offset = 0; + } + return 0; +} + +int tux_delete_entry(struct buffer_head *buffer, tux_dirent *entry) +{ + struct inode *dir = buffer_inode(buffer); + tux_dirent *prev = NULL, *this = bufdata(buffer); + while ((char *)this < (char *)entry) { + if (this->rec_len == 0) { + brelse(buffer); + tux_error(dir->i_sb, "zero-length directory entry"); + return -EIO; + } + prev = this; + this = next_entry(this); + } + if (prev) + prev->rec_len = tux_rec_len_to_disk((void *)entry + + tux_rec_len_from_disk(entry->rec_len) - (void *)prev); + memset(entry->name, 0, entry->name_len); + entry->name_len = entry->type = 0; + entry->inum = 0; + brelse_dirty(buffer); + dir->i_ctime = dir->i_mtime = gettime(); + mark_inode_dirty(dir); + return 0; +} + +int tux_dir_is_empty(struct inode *dir) +{ + unsigned blockbits = tux_sb(dir->i_sb)->blockbits; + unsigned blocks = dir->i_size >> blockbits, blocksize = 1 << blockbits; + be_u64 self = to_be_u64(tux_inode(dir)->inum); + struct buffer_head *buffer; + + for (unsigned block = 0; block < blocks; block++) { + buffer = blockread(mapping(dir), block); + if (!buffer) + return -EIO; + + tux_dirent *entry = bufdata(buffer); + tux_dirent *limit = bufdata(buffer) + blocksize - TUX_REC_LEN(1); + for (; entry <= limit; entry = next_entry(entry)) { + if (!entry->rec_len) { + brelse(buffer); + tux_error(dir->i_sb, "zero length entry at <%Lx:%x>", (L)tux_inode(dir)->inum, block); + return -EIO; + } + if (is_deleted(entry)) + continue; + if (entry->name[0] != '.') + goto not_empty; + if (entry->name_len > 2) + goto not_empty; + if (entry->name_len < 2) { + if (entry->inum != self) + goto not_empty; + } else if (entry->name[1] != '.') + goto not_empty; + } + brelse(buffer); + } + return 0; +not_empty: + brelse(buffer); + return -ENOTEMPTY; +} diff --git a/fs/tux3/dleaf.c b/fs/tux3/dleaf.c new file mode 100644 index 0000000..23d381a --- /dev/null +++ b/fs/tux3/dleaf.c @@ -0,0 +1,827 @@ +/* + * File index btree leaf operations + * + * Original copyright (c) 2008 Daniel Phillips + * Licensed under the GPL version 3 + * + * By contributing changes to this file you grant the original copyright holder + * the right to distribute those changes under any license. + */ + +#include "tux3.h" + +#ifndef trace +#define trace trace_on +#endif + +/* + * Leaf index format + * + * A leaf has a small header followed by a table of extents. A two level + * index grows down from the top of the leaf towards the top of the extent + * table. The index maps each unique logical address in the leaf to one or + * more extents beginning at that address. + * + * The top level index is a table of groups of entries all having the same + * high 24 bits of logical address which is only stored once, along with the + * 8 bit count of entries in the group. Since there can be more than 256 + * entries at the same logical address, there could be more than one group + * with the same logical address. The group count is used both to know the + * number of entries in the group and to find the beginning of the entry table + * for a given group, by adding up the sizes of the proceeding groups. + * + * The 8 bit entry limit limits the number of different versions at the same + * logical address to 255. For now. + * + * The second level entry tables are stored end to end in reverse immediately + * below the groups table, also stored in reverse. Each entry has the low 24 + * bits of the logical address and the 8 bit 'limit' offset of the last extent + * for that logical address, measuring from the first extent for the group in + * units of extent size. The limit is used rather than an offset so that the + * final offset is the count of extents in the group, which is summed up to + * locate the first extent for the group in the extent table. The difference + * between and entry limit and the limit of its predecessor gives the count of + * extents for the logical address specified by the entry. + * + * At the top level of a very large or very sparse btree it is likely that the + * group table will be relatively larger, up to the same size as all the entry + * tables. This does not matter much in terms of overall btree bulk. A few + * levels down the logical address space will have been split to the point + * where most entries in a leaf fit into one entry table. + * + * This leaf indexing scheme has some obscure boundary conditions, such as + * the zeroth entry of a group having no predecessor and thus needing to have + * a special check to supply zero as the preceding limit. Inserting and + * deleting are fairly involved and subtle. But the space required to index + * extents in a deep btree is reduced considerably, which is compelling. In + * the end, the indexing scheme provides access to a simple linear table of + * extents and a count, so there is little impact on the specialized methods + * that operate on those extents due to the complexity of the indexing scheme. + * The lookup operation on this index is very efficient. Each level of the + * index is suited to binary search. A sequence of inserts in ascending order + * in the same group requires no existing entries to be relocated, the reason + * the entry list is stored in reverse. + */ + +static inline struct dleaf *to_dleaf(vleaf *leaf) +{ + return leaf; +} + +static void dleaf_btree_init(struct btree *btree) +{ + btree->entries_per_leaf = 64; /* FIXME: should depend on blocksize */ +} + +int dleaf_init(struct btree *btree, vleaf *leaf) +{ + *to_dleaf(leaf) = (struct dleaf){ + .magic = to_be_u16(0x1eaf), + .free = to_be_u16(sizeof(struct dleaf)), + .used = to_be_u16(btree->sb->blocksize) }; + return 0; +} + +static int dleaf_sniff(struct btree *btree, vleaf *leaf) +{ + return from_be_u16(to_dleaf(leaf)->magic) == 0x1eaf; +} + +unsigned dleaf_free(struct btree *btree, vleaf *leaf) +{ + return from_be_u16(to_dleaf(leaf)->used) - from_be_u16(to_dleaf(leaf)->free); +} + +unsigned dleaf_need(struct btree *btree, vleaf *vleaf) +{ + struct dleaf *leaf = to_dleaf(vleaf); + return btree->sb->blocksize - dleaf_free(btree, leaf) - sizeof(struct dleaf); +} + +static inline tuxkey_t get_index(struct group *group, struct entry *entry) +{ + return ((tuxkey_t)group_keyhi(group) << 24) | entry_keylo(entry); +} + +void dleaf_dump(struct btree *btree, vleaf *vleaf) +{ + unsigned blocksize = btree->sb->blocksize; + struct dleaf *leaf = vleaf; + struct group *gdict = (void *)leaf + blocksize, *gbase = --gdict - dleaf_groups(leaf); + struct entry *edict = (void *)(gbase + 1), *entry = edict; + struct diskextent *extents = leaf->table; + + printf("%i entry groups:\n", dleaf_groups(leaf)); + for (struct group *group = gdict; group > gbase; group--) { + printf(" %ti/%i:", gdict - group, group_count(group)); + //printf(" [%i]", extents - leaf->table); + struct entry *ebase = entry - group_count(group); + while (entry > ebase) { + --entry; + unsigned offset = entry == edict - 1 ? 0 : entry_limit(entry + 1); + int count = entry_limit(entry) - offset; + printf(" %Lx =>", (L)get_index(group, entry)); + //printf(" %p (%i)", entry, entry_limit(entry)); + if (count < 0) + printf(" "); + else for (int i = 0; i < count; i++) { + struct diskextent extent = extents[offset + i]; + printf(" %Lx", (L)extent_block(extent)); + if (extent_count(extent)) + printf("/%x", extent_count(extent)); + } + //printf(" {%u}", entry_limit(entry)); + printf(";"); + } + printf("\n"); + extents += entry_limit(entry); + edict -= group_count(group); + } +} + +static int dleaf_free2(struct dleaf *leaf, unsigned blocksize) +{ + struct group *gdict = (void *)leaf + blocksize, *gstop = gdict - dleaf_groups(leaf); + struct entry *edict = (void *)gstop, *entry = edict; + struct diskextent *extents = leaf->table; + for (struct group *group = gdict; group-- > gstop;) + extents += entry_limit(entry -= group_count(group)); + return (void *)entry - (void *)extents; +} + +static int dleaf_check(struct dleaf *leaf, unsigned blocksize) +{ + struct group *gdict = (void *)leaf + blocksize, *gstop = gdict - dleaf_groups(leaf); + struct entry *edict = (void *)gstop, *estop = edict; + struct diskextent *extents = leaf->table; + unsigned excount = 0, encount = 0; + char *why; + + if (!dleaf_groups(leaf)) + return 0; + + unsigned keyhi = 0; + struct diskextent *exbase = leaf->table; + for (struct group *group = gdict - 1; group >= gstop; group--) { + assert(group_keyhi(group) >= keyhi); + assert(group_count(group) > 0); + assert(group_count(group) <= MAX_GROUP_ENTRIES); + keyhi = group_keyhi(group); + struct entry *entry = estop; + estop -= group_count(group); + unsigned limit = 0, keylo = -1; + while (--entry >= estop) { + assert((int)entry_keylo(entry) > (int)keylo); + assert(entry_limit(entry) > limit); + keylo = entry_keylo(entry); + limit = entry_limit(entry); + } + struct diskextent *exstop = exbase + entry_limit(estop); + block_t block = 0; + while (exbase < exstop) { + assert(extent_block(*exbase) != block); + exbase++; + } + excount += entry_limit(estop); + encount += group_count(group); + } + //printf("encount = %i, excount = %i, \n", encount, excount); + why = "used count wrong"; + if (from_be_u16(leaf->used) != (void *)(edict - encount) - (void *)leaf) + goto eek; + why = "free count wrong"; + if (from_be_u16(leaf->free) != (void *)(extents + excount) - (void *)leaf) + goto eek; + why = "free check mismatch"; + if (from_be_u16(leaf->used) - from_be_u16(leaf->free) != dleaf_free2(leaf, blocksize)) + goto eek; + return 0; +eek: + printf("free %i, used %i\n", from_be_u16(leaf->free), from_be_u16(leaf->used)); + printf("%s!\n", why); + return -1; +} + +int dleaf_split_at(vleaf *from, vleaf *into, struct entry *entry, unsigned blocksize) +{ + struct dleaf *leaf = from, *leaf2 = into; + unsigned groups = dleaf_groups(leaf), groups2; + struct group *gdict = from + blocksize, *gbase = gdict - groups; + struct entry *edict = (void *)gbase, *ebase = (void *)leaf + from_be_u16(leaf->used); + unsigned recount = 0, grsplit = 0, exsplit = 0; + unsigned entries = edict - ebase, split = edict - 1 - entry; + + printf("split %p into %p at %x\n", leaf, leaf2, split); + if (!groups) + return 0; + assert(ebase <= entry && entry < edict); + assert(split < entries); + for (struct group *group = gdict - 1; group >= gbase; group--, grsplit++) { + if (recount + group_count(group) > split) + break; + edict -= group_count(group); + exsplit += entry_limit(edict); + recount += group_count(group); + } + + /* have to split a group? */ + unsigned cut = split - recount; + if (cut) + exsplit += entry_limit(edict - cut); + edict = (void *)gbase; /* restore it */ + printf("split %i entries at group %i, entry %x\n", entries, grsplit, cut); + printf("split extents at %i\n", exsplit); + /* copy extents */ + unsigned size = from + from_be_u16(leaf->free) - (void *)(leaf->table + exsplit); + memcpy(leaf2->table, leaf->table + exsplit, size); + + /* copy groups */ + struct group *gdict2 = (void *)leaf2 + blocksize; + set_dleaf_groups(leaf2, groups2 = (groups - grsplit)); + veccopy(gdict2 - dleaf_groups(leaf2), gbase, dleaf_groups(leaf2)); + inc_group_count(gdict2 - 1, -cut); + set_dleaf_groups(leaf, groups = (grsplit + !!cut)); + gbase = gdict - groups; + if (cut) + set_group_count(gdict - groups, cut); + + /* copy entries */ + struct entry *edict2 = (void *)(gdict2 - groups2); + + assert((struct entry *)((void *)leaf + from_be_u16(leaf->used)) == edict - entries); + + unsigned encopy = entries - split; + veccopy(edict2 - encopy, ebase, encopy); + if (cut) + for (int i = 1; i <= group_count((gdict2 - 1)); i++) + inc_entry_limit(edict2 - i, -entry_limit(edict - split)); + vecmove(gdict - groups - split, edict - split, split); + + /* clean up */ + leaf->free = to_be_u16((void *)(leaf->table + exsplit) - from); + leaf->used = to_be_u16((void *)(gbase - split) - from); + leaf2->free = to_be_u16((void *)leaf->table + size - from); + leaf2->used = to_be_u16((void *)(gdict - groups2 - encopy) - from); + memset(from + from_be_u16(leaf->free), 0, from_be_u16(leaf->used) - from_be_u16(leaf->free)); + assert(!dleaf_check(leaf, blocksize)); + assert(!dleaf_check(leaf2, blocksize)); + return groups2; +} + +/* + * Split dleaf at middle in terms of entries, may be unbalanced in extents. + * Not used for now because we do the splits by hand in filemap.c + */ +static tuxkey_t dleaf_split(struct btree *btree, tuxkey_t key, vleaf *from, vleaf *into) +{ + struct dleaf *leaf = to_dleaf(from), *leaf2 = to_dleaf(into); + assert(dleaf_sniff(btree, from)); + unsigned blocksize = btree->sb->blocksize; + struct group *gdict = from + blocksize, *gbase = gdict - dleaf_groups(leaf); + struct entry *edict = (void *)gbase; + struct entry *ebase = (void *)leaf + from_be_u16(leaf->used); + unsigned entries = edict - ebase; + unsigned groups2 = dleaf_split_at(from, into, edict - entries / 2, blocksize); + struct group *gdict2 = (void *)leaf2 + blocksize; + return get_index(gdict2 - 1, (struct entry *)(gdict2 - groups2) - 1); +} + +void dleaf_merge(struct btree *btree, vleaf *vinto, vleaf *vfrom) +{ + struct dleaf *leaf = to_dleaf(vinto), *from = to_dleaf(vfrom); + struct group *gdict = (void *)leaf + btree->sb->blocksize; + struct group *gstop = gdict - dleaf_groups(leaf); + struct entry *edict = (struct entry *)gstop; + unsigned free = from_be_u16(leaf->free); + struct group *gdict2 = (void *)from + btree->sb->blocksize; + struct group *group2 = gdict2 - 1; + struct group *gstop2 = gdict2 - dleaf_groups(from); + struct entry *edict2 = (struct entry *)gstop2; + unsigned merge_gcount = 0, rest_gcount = 0; + int can_merge_group = 0; + + assert(dleaf_groups(leaf) >= 1); + if (dleaf_groups(from) == 0) + return; + + /* Try to merge group, and prepare to adjust */ + if (group_keyhi(gstop) == group_keyhi(group2) && + group_count(gstop) < MAX_GROUP_ENTRIES) { + unsigned gcount2 = group_count(group2); + unsigned room = MAX_GROUP_ENTRIES - group_count(gstop); + /* All entries can merge to this group? */ + if (room < gcount2) { + /* Calc adjust for the rest of group/entries */ + rest_gcount = gcount2 - room; + gcount2 = room; + } else + can_merge_group = 1; + + /* group/entries which merge */ + merge_gcount = gcount2; + } + + /* append extents */ + unsigned size = from_be_u16(from->free) - sizeof(struct dleaf); + memcpy((void *)leaf + free, from->table, size); + leaf->free = to_be_u16(free + size); + + /* make space and append groups except for possibly merged group */ + assert(sizeof(struct group) == sizeof(struct entry)); + unsigned addgroups = dleaf_groups(from) - can_merge_group; + struct entry *ebase2 = (void *)from + from_be_u16(from->used); + struct entry *ebase = (void *)leaf + from_be_u16(leaf->used); + vecmove(ebase - addgroups, ebase, edict - ebase); + veccopy(gstop - addgroups, gstop2, addgroups); + ebase -= addgroups; + inc_dleaf_groups(leaf, addgroups); + + /* append entries */ + size = (void *)edict2 - (void *)ebase2; + memcpy((void *)ebase - size, ebase2, size); + leaf->used = to_be_u16((void *)ebase - size - (void *)leaf); + + if (merge_gcount) { + /* adjust merged group */ + struct entry *estop = ebase - merge_gcount; + unsigned limit_adjust = entry_limit(ebase); + inc_group_count(gstop, merge_gcount); + while (--ebase >= estop) + inc_entry_limit(ebase, limit_adjust); + if (rest_gcount) { + /* adjust the group/entries which couldn't merge */ + ebase = estop; + estop = ebase - rest_gcount; + limit_adjust = entry_limit(edict2 - merge_gcount); + set_group_count(gstop - 1, rest_gcount); + while (--ebase >= estop) + inc_entry_limit(ebase, -limit_adjust); + } + } + assert(!dleaf_check(leaf, btree->sb->blocksize)); +} + +/* + * dleaf format and dwalk structure + * + * min address +--------------------------+ + * | dleaf header | + * | | extent <0> (gr 0, ent 0) | __ walk->exbase + * growing downwards | | extent <0> (gr 1, ent 0) | __ walk->extent + * | | extent <1> (gr 1, ent 1) | __ walk->exstop + * V | extent <2> (gr 1, ent 2) | + * | | + * | ....... | + * | | __ walk->estop + * | entry <2> (gr 1) | + * | entry <1> (gr 1) | __ walk->entry + * ^ | entry <0> (gr 1) | + * | | entry <0> (gr 0) | __ walk->group,walk->gstop + * growing upwards | | group <1> | + * | | group <0> | + * max address +--------------------------+ __ walk->gdict + * + * The above is dleaf format, and now dwalk_next() was called 2 times. + * + * ->gdict is the end of dleaf. + * ->group is the current group (group <1>) + * ->gstop is the last group in this dleaf + * ->entry is the current entry (entry <0> (gr 1)) + * ->estop is the last entry in current group + * ->exbase is the first extent in current group + * ->extent is the current extent (extent <1> (gr1, ent 1)). + * ->exstop is the first extent in next entry. + * (I.e. the address that dwalk_next() has to update to next entry. + * If there is no next, it will stop with ->extent == ->exstop.) + */ + +/* FIXME: current code is assuming the entry has only one extent. */ + +/* The first extent in dleaf */ +static int dwalk_first(struct dwalk *walk) +{ + return walk->leaf->table == walk->extent; +} + +/* The end of extent in dleaf */ +int dwalk_end(struct dwalk *walk) +{ + return walk->extent == walk->exstop; +} + +tuxkey_t dwalk_index(struct dwalk *walk) +{ + return get_index(walk->group, walk->entry); +} + +block_t dwalk_block(struct dwalk *walk) +{ + return extent_block(*walk->extent); +} + +unsigned dwalk_count(struct dwalk *walk) +{ + return extent_count(*walk->extent); +} + +/* unused */ +void dwalk_dump(struct dwalk *walk) +{ + if (walk->leaf->table == walk->exstop) { + trace_on("empty leaf"); + return; + } + if (dwalk_end(walk)) { + trace_on("end of extent"); + return; + } + struct diskextent *entry_exbase; + if (walk->entry + 1 == walk->estop + group_count(walk->group)) + entry_exbase = walk->exbase; + else + entry_exbase = walk->exbase + entry_limit(walk->entry + 1); + trace_on("leaf %p", walk->leaf); + trace_on("group %tu/%tu", (walk->gdict - walk->group) - 1, walk->gdict - walk->gstop); + trace_on("entry %tu/%u", group_count(walk->group) - (walk->entry - walk->estop) - 1, group_count(walk->group)); + trace_on("extent %tu/%tu", walk->extent - entry_exbase, walk->exstop - entry_exbase); +} + +static void dwalk_check(struct dwalk *walk) +{ + if (!dleaf_groups(walk->leaf)) { + assert(walk->group == walk->gstop); + assert(walk->entry == walk->estop); + assert(walk->exbase == walk->extent); + assert(walk->extent == walk->exstop); + assert(walk->leaf->table == walk->exstop); + } else if (dwalk_end(walk)) { + assert(walk->group == walk->gstop); + assert(walk->entry == walk->estop); + assert(walk->exbase < walk->extent); + assert(walk->extent == walk->exstop); + } else { + assert(walk->group >= walk->gstop); + assert(walk->entry >= walk->estop); + assert(walk->exbase <= walk->extent); + assert(walk->extent < walk->exstop); + } +} + +/* Set the cursor to next extent */ +int dwalk_next(struct dwalk *walk) +{ + trace(" "); + /* last extent of this dleaf, or empty dleaf */ + if (dwalk_end(walk)) + return 0; + walk->extent++; + if (walk->extent == walk->exstop) { + if (walk->entry == walk->estop) { + if (walk->group == walk->gstop) + return 0; + walk->group--; + walk->exbase += entry_limit(walk->estop); + walk->estop -= group_count(walk->group); + } + walk->entry--; + walk->exstop = walk->exbase + entry_limit(walk->entry); + } + dwalk_check(walk); + return 1; +} + +/* Back to the previous extent. (i.e. rewind the previous dwalk_next()) */ +int dwalk_back(struct dwalk *walk) +{ + trace(" "); + /* first extent of this dleaf, or empty dleaf */ + if (dwalk_first(walk)) + return 0; + struct diskextent *entry_exbase; + if (walk->entry + 1 == walk->estop + group_count(walk->group)) + entry_exbase = walk->exbase; + else + entry_exbase = walk->exbase + entry_limit(walk->entry + 1); + walk->extent--; + if (walk->extent < entry_exbase) { + if (walk->extent < walk->exbase) { + if (walk->group == walk->gdict) + return 1; + walk->group++; + walk->estop = walk->entry + 1; + walk->exbase -= entry_limit(walk->entry + 1); + } + walk->entry++; + walk->exstop = walk->exbase + entry_limit(walk->entry); + } + dwalk_check(walk); + return 1; +} + +/* + * Probe the extent position with key. If not found, position is next + * extent of key. If probed all extents return 0, otherwise return 1 + * (I.e. current extent is valid. IOW, !dwalk_end()). + */ +int dwalk_probe(struct dleaf *leaf, unsigned blocksize, struct dwalk *walk, tuxkey_t key) +{ + trace("probe for 0x%Lx", (L)key); + unsigned keylo = key & 0xffffff, keyhi = key >> 24; + + walk->leaf = leaf; + walk->gdict = (void *)leaf + blocksize; + walk->gstop = walk->gdict - dleaf_groups(leaf); + walk->group = walk->gdict; + walk->estop = (struct entry *)walk->gstop; + walk->exbase = leaf->table; + if (!dleaf_groups(leaf)) { + /* dwalk_first() and dwalk_end() will return true */ + walk->entry = (struct entry *)walk->gstop; + walk->extent = leaf->table; + walk->exstop = leaf->table; + dwalk_check(walk); + return 0; + } + + while (walk->group > walk->gstop) { + walk->group--; + walk->entry = walk->estop - 1; + walk->estop -= group_count(walk->group); + if (group_keyhi(walk->group) > keyhi) + goto no_group; + if (group_keyhi(walk->group) == keyhi) { + if (entry_keylo(walk->entry) > keylo) + goto no_group; + if (walk->group == walk->gstop) + goto probe_entry; + if (group_keyhi(walk->group - 1) > keyhi) + goto probe_entry; + if (entry_keylo(walk->estop - 1) > keylo) + goto probe_entry; + } + walk->exbase += entry_limit(walk->estop); + } + /* There is no group after this key */ + walk->entry = walk->estop; + walk->exstop = walk->exbase; + walk->extent = walk->exbase; + walk->exbase = walk->exbase - entry_limit(walk->estop); + dwalk_check(walk); + return 0; + +no_group: + /* There is no interesting group, set first extent in this group */ + walk->extent = walk->exbase; + walk->exstop = walk->exbase + entry_limit(walk->entry); + dwalk_check(walk); + return 1; + +probe_entry: + /* There is interesting group, next is probe interesting entry */ + walk->extent = walk->exbase; + walk->exstop = walk->exbase + entry_limit(walk->entry); + while (walk->entry > walk->estop) { + if (entry_keylo(walk->entry - 1) > keylo) + break; + walk->entry--; + walk->extent = walk->exstop; + walk->exstop = walk->exbase + entry_limit(walk->entry); + } + /* Now, entry has the nearest keylo (<= key), probe extent */ + /* FIXME: this is assuming the entry has only one extent */ + if (key < dwalk_index(walk) + dwalk_count(walk)) + return 1; + /* This entry didn't have the target extent, set next entry */ + dwalk_next(walk); + return !dwalk_end(walk); +} + +int dwalk_mock(struct dwalk *walk, tuxkey_t index, struct diskextent extent) +{ + if (!dleaf_groups(walk->leaf) || walk->entry == walk->estop || dwalk_index(walk) != index) { + trace("add entry 0x%Lx", (L)index); + unsigned keylo = index & 0xffffff, keyhi = index >> 24; + if (!walk->mock.groups || group_keyhi(&walk->mock.group) != keyhi || group_count(&walk->mock.group) >= MAX_GROUP_ENTRIES) { + trace("add group %i", walk->mock.groups); + walk->exbase += entry_limit(&walk->mock.entry); + walk->mock.group = make_group(keyhi, 0); + walk->mock.used -= sizeof(struct group); + walk->mock.groups++; + } + walk->mock.used -= sizeof(struct entry); + walk->mock.entry = make_entry(keylo, walk->extent - walk->exbase); + inc_group_count(&walk->mock.group, 1); + } + trace("add extent 0x%Lx => 0x%Lx/%x", (L)index, (L)extent_block(extent), extent_count(extent)); + walk->mock.free += sizeof(*walk->extent); + walk->extent++; + inc_entry_limit(&walk->mock.entry, 1); + return 0; +} + +/* This copy extents >= this extent to another dleaf. */ +void dwalk_copy(struct dwalk *walk, struct dleaf *dest) +{ + struct dleaf *leaf = walk->leaf; + unsigned blocksize = (void *)walk->gdict - (void *)leaf; + + assert(dleaf_groups(dest) == 0); + if (dwalk_end(walk)) + return; + if (dwalk_first(walk)) { + memcpy(dest, leaf, blocksize); + return; + } + + struct group *gdict2 = (void *)dest + blocksize; + unsigned groups2 = walk->group + 1 - walk->gstop; + struct entry *ebase = (void *)leaf + from_be_u16(leaf->used); + unsigned entries = (walk->entry + 1) - ebase; + struct entry *edict2 = (struct entry *)(gdict2 - groups2); + struct diskextent *exend = (void *)leaf + from_be_u16(leaf->free); + struct diskextent *entry_exbase; + unsigned limit_adjust, extents; + + if (walk->entry + 1 == walk->estop + group_count(walk->group)) { + entry_exbase = walk->exbase; + limit_adjust = 0; + } else { + entry_exbase = walk->exbase + entry_limit(walk->entry + 1); + limit_adjust = entry_limit(walk->entry + 1); + } + extents = exend - entry_exbase; + + veccopy(gdict2 - groups2, walk->gstop, groups2); + veccopy(edict2 - entries, ebase, entries); + veccopy(dest->table, entry_exbase, extents); + + unsigned gcount2 = (walk->entry + 1) - walk->estop; + set_dleaf_groups(dest, groups2); + dest->free = to_be_u16((void *)(dest->table + extents) - (void *)dest); + dest->used = to_be_u16((void *)(edict2 - entries) - (void *)dest); + set_group_count(gdict2 - 1, gcount2); + struct entry *entry2 = edict2 - 1, *estop2 = edict2 - gcount2; + while (entry2 >= estop2) { + inc_entry_limit(entry2, -limit_adjust); + entry2--; + } + assert(!dleaf_check(dest, blocksize)); +} + +/* This removes extents >= this extent. (cursor position is dwalk_end()). */ +void dwalk_chop(struct dwalk *walk) +{ + trace(" "); + if (dwalk_end(walk)) + return; + + struct dleaf *leaf = walk->leaf; + if (dwalk_first(walk)) { + unsigned blocksize = (void *)walk->gdict - (void *)leaf; + set_dleaf_groups(leaf, 0); + leaf->free = to_be_u16(sizeof(struct dleaf)); + leaf->used = to_be_u16(blocksize); + /* Initialize dwalk state */ + dwalk_probe(leaf, blocksize, walk, 0); + return; + } + + /* If extent is first extent on this group, remove this group too */ + dwalk_back(walk); + + struct entry *ebase = walk->estop + group_count(walk->group); + void *entry = walk->entry; + set_dleaf_groups(leaf, walk->gdict - walk->group); + set_group_count(walk->group, ebase - walk->entry); + entry += (void *)walk->group - (void *)walk->gstop; + memmove(entry, walk->entry, (void *)walk->gstop - (void *)walk->entry); + walk->estop = walk->entry = entry; + walk->gstop = walk->group; + walk->exstop = walk->exbase + entry_limit(walk->entry); + walk->extent = walk->exstop; + leaf->free = to_be_u16((void *)walk->exstop - (void *)leaf); + leaf->used = to_be_u16((void *)walk->estop - (void *)leaf); + dwalk_check(walk); + assert(!dleaf_check(leaf, (void *)walk->gdict - (void *)leaf)); +} + +/* + * Add extent to dleaf. This can use only if dwalk_end() is true. + * Note, dwalk state is invalid after this. (I.e. it can be used only + * for dwalk_add()) + */ +int dwalk_add(struct dwalk *walk, tuxkey_t index, struct diskextent extent) +{ + struct dleaf *leaf = walk->leaf; + unsigned groups = dleaf_groups(leaf); + unsigned free = from_be_u16(leaf->free); + unsigned used = from_be_u16(leaf->used); + + /* FIXME: assume entry has only one extent */ + assert(!groups || dwalk_index(walk) != index); + assert(extent_block(extent) > 0 && extent_count(extent) > 0); + + trace("group %ti/%i", walk->gstop + groups - 1 - walk->group, groups); + if (!groups || dwalk_index(walk) != index) { + trace("add entry 0x%Lx", (L)index); + unsigned keylo = index & 0xffffff, keyhi = index >> 24; + if (!groups || group_keyhi(walk->group) != keyhi || group_count(walk->group) >= MAX_GROUP_ENTRIES) { + trace("add group %i", groups); + /* will it fit? */ + assert(sizeof(*walk->entry) == sizeof(*walk->group)); + assert(free <= used - sizeof(*walk->entry)); + /* move entries down, adjust walk state */ + /* could preplan this to avoid move: need additional pack state */ + vecmove(walk->entry - 1, walk->entry, (struct entry *)walk->group - walk->entry); + walk->entry--; /* adjust to moved position */ + walk->exbase += groups ? entry_limit(walk->entry) : 0; + *--walk->group = make_group(keyhi, 0); + used -= sizeof(*walk->group); + set_dleaf_groups(leaf, ++groups); + } + assert(free <= used - sizeof(*walk->entry)); + used -= sizeof(*walk->entry); + leaf->used = to_be_u16(used); + *--walk->entry = make_entry(keylo, walk->extent - walk->exbase); + inc_group_count(walk->group, 1); + } + trace("add extent %ti", walk->extent - leaf->table); + assert(free + sizeof(*walk->extent) <= used); + free += sizeof(*walk->extent); + leaf->free = to_be_u16(free); + *walk->extent++ = extent; + inc_entry_limit(walk->entry, 1); + + assert(!dleaf_check(leaf, (void *)walk->gdict - (void *)walk->leaf)); + + return 0; // extent out of order??? leaf full??? +} + +/* Update this extent. The caller have to check new extent isn't overlapping. */ +static void dwalk_update(struct dwalk *walk, struct diskextent extent) +{ + *walk->extent = extent; +} + +/* + * Reasons this dleaf truncator sucks: + * + * * Does not check for integrity at all so a corrupted leaf can cause overflow + * and system corruption. + * + * * Assumes all block pointers after the truncation point will be deleted, + * which does not hold when versions arrive. + * + * * Modifies a group count in the middle of the traversal knowing that it has + * already loaded the changed field and will not load it again, fragile. + * + * * Does not provide a generic mechanism that can be adapted to other + * truncation tasks. + * + * But it does truncate so it is getting checked in just for now. + */ +static int dleaf_chop(struct btree *btree, tuxkey_t chop, vleaf *vleaf) +{ + struct sb *sb = btree->sb; + struct dleaf *leaf = to_dleaf(vleaf); + struct dwalk walk; + + if (!dwalk_probe(leaf, sb->blocksize, &walk, chop)) + return 0; + + /* Chop this extent partially */ + if (dwalk_index(&walk) < chop) { + block_t block = dwalk_block(&walk); + unsigned count = chop - dwalk_index(&walk); + + /* FIXME: err check? */ + (btree->ops->bfree)(sb, block + count, dwalk_count(&walk) - count); + dwalk_update(&walk, make_extent(block, count)); + if (!dwalk_next(&walk)) + goto out; + } + struct dwalk rewind = walk; + do { + /* FIXME: err check? */ + (btree->ops->bfree)(sb, dwalk_block(&walk), dwalk_count(&walk)); + } while (dwalk_next(&walk)); + dwalk_chop(&rewind); +out: + assert(!dleaf_check(leaf, sb->blocksize)); + return 1; +} + +struct btree_ops dtree_ops = { + .btree_init = dleaf_btree_init, + .leaf_sniff = dleaf_sniff, + .leaf_init = dleaf_init, + .leaf_dump = dleaf_dump, + .leaf_need = dleaf_need, + .leaf_free = dleaf_free, + .leaf_split = dleaf_split, +// .leaf_resize = dleaf_resize, + .leaf_chop = dleaf_chop, + .leaf_merge = dleaf_merge, + .balloc = balloc, + .bfree = bfree, +}; diff --git a/fs/tux3/filemap.c b/fs/tux3/filemap.c new file mode 100644 index 0000000..6297b32 --- /dev/null +++ b/fs/tux3/filemap.c @@ -0,0 +1,432 @@ +#include "tux3.h" + +#ifndef trace +#define trace trace_on +#endif + +#define SEG_HOLE (1 << 0) +#define SEG_NEW (1 << 1) + +struct seg { block_t block; unsigned count; unsigned state; }; + +/* userland only */ +void show_segs(struct seg map[], unsigned segs) +{ + printf("%i segs: ", segs); + for (int i = 0; i < segs; i++) + printf("%Lx/%i ", (L)map[i].block, map[i].count); + printf("\n"); +} + +static int map_region(struct inode *inode, block_t start, unsigned count, struct seg map[], unsigned max_segs, int create) +{ + struct cursor *cursor = alloc_cursor(&tux_inode(inode)->btree, 1); /* allows for depth increase */ + if (!cursor) + return -ENOMEM; + + if (create) + down_write(&cursor->btree->lock); + else + down_read(&cursor->btree->lock); + + assert(max_segs > 0); + block_t limit = start + count; + trace("--- index %Lx, limit %Lx ---", (L)start, (L)limit); + struct btree *btree = cursor->btree; + struct sb *sb = btree->sb; + int err, segs = 0; + + if (!btree->root.depth) + goto out_unlock; + + if ((err = probe(btree, start, cursor))) { + segs = err; + goto out_unlock; + } + //assert(start >= this_key(cursor, btree->root.depth)) + /* do not overlap next leaf */ + if (limit > next_key(cursor, btree->root.depth)) + limit = next_key(cursor, btree->root.depth); + struct dleaf *leaf = bufdata(cursor_leafbuf(cursor)); + dleaf_dump(btree, leaf); + + struct dwalk *walk = &(struct dwalk){ }; + block_t index = start, seg_start, block; + dwalk_probe(leaf, sb->blocksize, walk, start); + struct dwalk headwalk = *walk; + if (!dwalk_end(walk) && dwalk_index(walk) < start) + seg_start = dwalk_index(walk); + else + seg_start = index; + while (index < limit && segs < max_segs) { + block_t ex_index; + if (!dwalk_end(walk)) + ex_index = dwalk_index(walk); + else + ex_index = limit; + + if (index < ex_index) { + /* There is hole */ + ex_index = min(ex_index, limit); + unsigned gap = ex_index - index; + index = ex_index; + map[segs++] = (struct seg){ .count = gap, .state = SEG_HOLE }; + } else { + block = dwalk_block(walk); + count = dwalk_count(walk); + trace("emit %Lx/%x", (L)block, count ); + map[segs++] = (struct seg){ .block = block, .count = count }; + index = ex_index + count; + dwalk_next(walk); + } + } + assert(segs); + unsigned below = start - seg_start, above = index - min(index, limit); + map[0].block += below; + map[0].count -= below; + map[segs - 1].count -= above; + + if (!create) + goto out_release; + + struct dleaf *tail = NULL; + tuxkey_t tailkey = 0; // probably can just use limit instead + + if (!dwalk_end(walk)) { + tail = malloc(sb->blocksize); // error??? + dleaf_init(btree, tail); + tailkey = dwalk_index(walk); + dwalk_copy(walk, tail); + } + + for (int i = 0; i < segs; i++) { + if (map[i].state == SEG_HOLE) { + count = map[i].count; + block = balloc(sb, count); // goal ??? + trace("fill in %Lx/%i ", (L)block, count); + if (block == -1) { + /* + * Out of space on file data allocation. It happens. Tread + * carefully. We have not stored anything in the btree yet, + * so we free what we allocated so far. We need to leave the + * user with a nice ENOSPC return and all metadata consistent + * on disk. We better have reserved everything we need for + * metadata, just giving up is not an option. + */ + /* + * Alternatively, we can go ahead and try to record just what + * we successfully allocated, then if the update fails on no + * space for btree splits, free just the blocks for extents + * we failed to store. + */ + segs = -ENOSPC; + goto out_create; + } + map[i] = (struct seg){ .block = block, .count = count, .state = SEG_NEW, }; + } + } + /* Go back to region start and pack in new segs */ + dwalk_chop(&headwalk); + index = start; + for (int i = -!!below; i < segs + !!above; i++) { + if (dleaf_free(btree, leaf) < 16) { + mark_buffer_dirty(cursor_leafbuf(cursor)); + struct buffer_head *newbuf = new_leaf(btree); + if (!newbuf) { + segs = -ENOMEM; + goto out_create; + } + /* + * ENOSPC on btree index split could leave the cache state + * badly messed up. Going to have to do this in two steps: + * first, look at the cursor to see how many splits we need, + * then make sure we have that, or give up before starting. + */ + btree_insert_leaf(cursor, index, newbuf); + leaf = bufdata(cursor_leafbuf(cursor)); + dwalk_probe(leaf, sb->blocksize, &headwalk, index); + } + if (i < 0) { + trace("emit below"); + dwalk_add(&headwalk, index - below, make_extent(map[0].block - below, below)); + continue; + } + if (i == segs) { + trace("emit above"); + dwalk_add(&headwalk, index, make_extent(map[segs - 1].block + map[segs - 1].count, above)); + continue; + } + trace("pack 0x%Lx => %Lx/%x", (L)index, (L)map[i].block, map[i].count); + dleaf_dump(btree, leaf); + dwalk_add(&headwalk, index, make_extent(map[i].block, map[i].count)); + dleaf_dump(btree, leaf); + index += map[i].count; + } + if (tail) { + if (dleaf_need(btree, tail) < dleaf_free(btree, leaf)) + dleaf_merge(btree, leaf, tail); + else { + mark_buffer_dirty(cursor_leafbuf(cursor)); + assert(dleaf_groups(tail) >= 1); + /* Tail does not fit, add it as a new btree leaf */ + struct buffer_head *newbuf = new_leaf(btree); + if (!newbuf) { + segs = -ENOMEM; + goto out_create; + } + memcpy(bufdata(newbuf), tail, sb->blocksize); + if ((err = btree_insert_leaf(cursor, tailkey, newbuf))) { + free(tail); + segs = err; + goto out_unlock; + } + } + } + mark_buffer_dirty(cursor_leafbuf(cursor)); +out_create: + if (tail) + free(tail); +out_release: + release_cursor(cursor); +out_unlock: + if (create) + up_write(&cursor->btree->lock); + else + up_read(&cursor->btree->lock); + free_cursor(cursor); + return segs; +} + +#ifdef __KERNEL__ +#include + +int tux3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + trace("==> inum %Lu, iblock %Lu, b_size %zu, create %d", + (L)tux_inode(inode)->inum, (L)iblock, bh_result->b_size, create); + + struct sb *sb = tux_sb(inode->i_sb); + size_t max_blocks = bh_result->b_size >> inode->i_blkbits; + struct btree *btree = &tux_inode(inode)->btree; + int depth = btree->root.depth; + if (!depth) { + warn("Uninitialized inode %lx", inode->i_ino); + return 0; + } + + struct seg seg; + int segs = map_region(inode, iblock, max_blocks, &seg, 1, create); + if (segs < 0) { + warn("map_region failed: %d", -segs); + return -EIO; + } + assert(segs == 1); + size_t blocks = min_t(size_t, max_blocks, seg.count); + if (seg.state == SEG_NEW) { + assert(seg.block); + set_buffer_new(bh_result); + inode->i_blocks += blocks << (sb->blockbits - 9); + } + if (seg.state != SEG_HOLE) { + map_bh(bh_result, inode->i_sb, seg.block); + bh_result->b_size = blocks << sb->blockbits; + } + trace("<== inum %Lu, mapped %d, block %Lu, size %zu", + (L)tux_inode(inode)->inum, buffer_mapped(bh_result), + (L)bh_result->b_blocknr, bh_result->b_size); + + return 0; +} + +static struct buffer_head *find_buffer(struct address_space *mapping, + pgoff_t index, int offset) +{ + struct buffer_head *bh = NULL; + struct page *page; + + page = find_get_page(mapping, index); + if (page) { + spin_lock(&mapping->private_lock); + if (page_has_buffers(page)) { + bh = page_buffers(page); + if (bh){ + while (offset--) + bh = bh->b_this_page; + if (buffer_uptodate(bh)) + get_bh(bh); + else + bh = NULL; + } + } + spin_unlock(&mapping->private_lock); + page_cache_release(page); + } + return bh; +} + +struct buffer_head *blockread(struct address_space *mapping, block_t iblock) +{ + struct inode *inode = mapping->host; + pgoff_t index; + struct page *page; + struct buffer_head *bh; + int offset; + + index = iblock >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + offset = iblock & ((PAGE_CACHE_SHIFT - inode->i_blkbits) - 1); + + /* FIXME: hack */ + bh = find_buffer(mapping, index, offset); + if (bh) + return bh; + + page = read_mapping_page(mapping, index, NULL); + if (IS_ERR(page)) + goto error; + if (PageError(page)) + goto error_page; + + lock_page(page); + + if (!page_has_buffers(page)) + create_empty_buffers(page, tux_sb(inode->i_sb)->blocksize, 0); + + bh = page_buffers(page); + while (offset--) + bh = bh->b_this_page; + get_bh(bh); + + unlock_page(page); + page_cache_release(page); + + return bh; + +error_page: + page_cache_release(page); +error: + return NULL; +} + +struct buffer_head *blockget(struct address_space *mapping, block_t iblock) +{ + struct inode *inode = mapping->host; + pgoff_t index; + struct page *page; + struct buffer_head *bh; + void *fsdata; + int err, offset; + + index = iblock >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + offset = iblock & ((PAGE_CACHE_SHIFT - inode->i_blkbits) - 1); + + err = mapping->a_ops->write_begin(NULL, mapping, + iblock << inode->i_blkbits, + 1 << inode->i_blkbits, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (err) + return NULL; + + if (!page_has_buffers(page)) + create_empty_buffers(page, tux_sb(inode->i_sb)->blocksize, 0); + + bh = page_buffers(page); + while (offset--) + bh = bh->b_this_page; + get_bh(bh); + set_buffer_uptodate(bh); + + unlock_page(page); + page_cache_release(page); + + return bh; +} + +static int tux3_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, tux3_get_block); +} + +static int tux3_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, tux3_get_block); +} + +static int tux3_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + *pagep = NULL; + return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + tux3_get_block); +} + +static int tux3_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, tux3_get_block, wbc); +} + +static int tux3_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, tux3_get_block); +} + +static ssize_t tux3_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, tux3_get_block, NULL); +} + +static sector_t tux3_bmap(struct address_space *mapping, sector_t iblock) +{ + sector_t blocknr; + + mutex_lock(&mapping->host->i_mutex); + blocknr = generic_block_bmap(mapping, iblock, tux3_get_block); + mutex_unlock(&mapping->host->i_mutex); + + return blocknr; +} + +const struct address_space_operations tux_aops = { + .readpage = tux3_readpage, + .readpages = tux3_readpages, + .writepage = tux3_writepage, + .writepages = tux3_writepages, + .sync_page = block_sync_page, + .write_begin = tux3_write_begin, + .write_end = generic_write_end, + .bmap = tux3_bmap, +// .invalidatepage = ext4_da_invalidatepage, +// .releasepage = ext4_releasepage, + .direct_IO = tux3_direct_IO, + .migratepage = buffer_migrate_page, +// .is_partially_uptodate = block_is_partially_uptodate, +}; + +static int tux3_blk_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, tux3_get_block); +} + +static int tux3_blk_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, tux3_get_block, wbc); +} + +const struct address_space_operations tux_blk_aops = { + .readpage = tux3_blk_readpage, + .writepage = tux3_blk_writepage, + .writepages = tux3_writepages, + .sync_page = block_sync_page, + .write_begin = tux3_write_begin, + .bmap = tux3_bmap, +}; +#endif /* __KERNEL__ */ diff --git a/fs/tux3/hexdump.c b/fs/tux3/hexdump.c new file mode 100644 index 0000000..9b409e9 --- /dev/null +++ b/fs/tux3/hexdump.c @@ -0,0 +1,22 @@ +/* Copyright (c) 2008 Daniel Phillips , GPL v3 */ + +#include "tux3.h" + +void hexdump(void *data, unsigned size) +{ + while (size) { + unsigned char *p; + int w = 16, n = size < w? size: w, pad = w - n; + printf("%p: ", data); + for (p = data; p < (unsigned char *)data + n;) + printf("%02hx ", *p++); + printf("%*.s \"", pad*3, ""); + for (p = data; p < (unsigned char *)data + n;) { + int c = *p++; + printf("%c", c < ' ' || c > 127 ? '.' : c); + } + printf("\"\n"); + data += w; + size -= n; + } +} diff --git a/fs/tux3/iattr.c b/fs/tux3/iattr.c new file mode 100644 index 0000000..9fc2b41 --- /dev/null +++ b/fs/tux3/iattr.c @@ -0,0 +1,187 @@ +/* + * Inode table attributes + * + * Original copyright (c) 2008 Daniel Phillips + * Licensed under the GPL version 3 + * + * By contributing changes to this file you grant the original copyright holder + * the right to distribute those changes under any license. + */ + +#include "tux3.h" + +/* + * Variable size attribute format: + * + * immediate data: kind+version:16, bytes:16, data[bytes] + * immediate xattr: kind+version:16, bytes:16, atom:16, data[bytes - 2] + */ + +unsigned atsize[MAX_ATTRS] = { + [MODE_OWNER_ATTR] = 12, + [CTIME_SIZE_ATTR] = 14, + [DATA_BTREE_ATTR] = 8, + [LINK_COUNT_ATTR] = 4, + [MTIME_ATTR] = 6, + [IDATA_ATTR] = 2, + [XATTR_ATTR] = 4, +}; + +unsigned encode_asize(unsigned bits) +{ + unsigned need = 0; + for (int kind = MIN_ATTR; kind < VAR_ATTRS; kind++) + if ((bits & (1 << kind))) + need += atsize[kind] + 2; + return need; +} + +/* unused */ +int attr_check(void *attrs, unsigned size) +{ + void *limit = attrs + size; + unsigned head; + while (attrs < limit - 1) + { + attrs = decode16(attrs, &head); + unsigned kind = head >> 12; + if (kind < MIN_ATTR || kind >= MAX_ATTRS) + return 0; + if (attrs + atsize[kind] > limit) + return 0; + attrs += atsize[kind]; + } + return 1; +} + +void dump_attrs(struct inode *inode) +{ + //printf("present = %x\n", inode->present); + tuxnode_t *tuxnode = tux_inode(inode); + for (int kind = 0; kind < 32; kind++) { + if (!(tux_inode(inode)->present & (1 << kind))) + continue; + switch (kind) { + case MODE_OWNER_ATTR: + printf("mode 0%.6o uid %x gid %x ", inode->i_mode, inode->i_uid, inode->i_gid); + break; + case CTIME_SIZE_ATTR: + printf("ctime %Lx size %Lx ", (L)tuxtime(inode->i_ctime), (L)inode->i_size); + break; + case MTIME_ATTR: + printf("mtime %Lx ", (L)tuxtime(inode->i_mtime)); + break; + case LINK_COUNT_ATTR: + printf("links %u ", inode->i_nlink); + break; + case DATA_BTREE_ATTR: + printf("root %Lx:%u ", (L)tuxnode->btree.root.block, tuxnode->btree.root.depth); + break; + case XATTR_ATTR: + printf("xattr(s) "); + break; + default: + printf("<%i>? ", kind); + break; + } + } + printf("\n"); +} + +void *encode_attrs(struct inode *inode, void *attrs, unsigned size) +{ + trace_off("encode %u attr bytes", size); + tuxnode_t *tuxnode = tux_inode(inode); + void *limit = attrs + size - 3; + for (int kind = MIN_ATTR; kind < VAR_ATTRS; kind++) { + if (!(tuxnode->present & (1 << kind))) + continue; + if (attrs >= limit) + break; + attrs = encode_kind(attrs, kind, tux_sb(inode->i_sb)->version); + switch (kind) { + case MODE_OWNER_ATTR: + attrs = encode32(attrs, inode->i_mode); + attrs = encode32(attrs, inode->i_uid); + attrs = encode32(attrs, inode->i_gid); + break; + case CTIME_SIZE_ATTR: + attrs = encode48(attrs, tuxtime(inode->i_ctime) >> TIME_ATTR_SHIFT); + attrs = encode64(attrs, inode->i_size); + break; + case MTIME_ATTR: + attrs = encode48(attrs, tuxtime(inode->i_mtime) >> TIME_ATTR_SHIFT); + break; + case LINK_COUNT_ATTR: + attrs = encode32(attrs, inode->i_nlink); + break; + case DATA_BTREE_ATTR: + attrs = encode64(attrs, pack_root(&tuxnode->btree.root)); + break; + } + } + return attrs; +} + +void *decode_attrs(struct inode *inode, void *attrs, unsigned size) +{ + trace_off("decode %u attr bytes", size); + struct sb *sb = tux_sb(inode->i_sb); + tuxnode_t *tuxnode = tux_inode(inode); + struct xattr *xattr = tuxnode->xcache ? tuxnode->xcache->xattrs : NULL; + void *limit = attrs + size; + u64 v64; + u32 v32; + while (attrs < limit - 1) { + unsigned head; + attrs = decode16(attrs, &head); + unsigned version = head & 0xfff, kind = head >> 12; + if (version != sb->version) { + attrs += atsize[kind]; + continue; + } + switch (kind) { + case MODE_OWNER_ATTR: + attrs = decode32(attrs, &v32); + inode->i_mode = v32; + attrs = decode32(attrs, &v32); + inode->i_uid = v32; + attrs = decode32(attrs, &v32); + inode->i_gid = v32; + break; + case CTIME_SIZE_ATTR: + attrs = decode48(attrs, &v64); + attrs = decode64(attrs, (u64 *)&inode->i_size); // decode to temp? + inode->i_ctime = spectime(v64 << TIME_ATTR_SHIFT); + break; + case MTIME_ATTR: + attrs = decode48(attrs, &v64); + inode->i_mtime = spectime(v64 << TIME_ATTR_SHIFT); + break; + case LINK_COUNT_ATTR: + attrs = decode32(attrs, &inode->i_nlink); + break; + case DATA_BTREE_ATTR: + attrs = decode64(attrs, &v64); + init_btree(&tuxnode->btree, sb, unpack_root(v64), &dtree_ops); + break; + case XATTR_ATTR:; + // immediate xattr: kind+version:16, bytes:16, atom:16, data[bytes - 2] + unsigned bytes, atom; + attrs = decode16(attrs, &bytes); + attrs = decode16(attrs, &atom); + *xattr = (struct xattr){ .atom = atom, .size = bytes - 2 }; + unsigned xsize = sizeof(struct xattr) + xattr->size; + assert((void *)xattr + xsize <= (void *)tuxnode->xcache + tuxnode->xcache->maxsize); + memcpy(xattr->body, attrs, xattr->size); + attrs += xattr->size; + tuxnode->xcache->size += xsize; + xattr = xcache_next(xattr); // check limit!!! + break; + default: + return NULL; + } + tuxnode->present |= 1 << kind; + } + return attrs; +} diff --git a/fs/tux3/ileaf.c b/fs/tux3/ileaf.c new file mode 100644 index 0000000..e20796b --- /dev/null +++ b/fs/tux3/ileaf.c @@ -0,0 +1,303 @@ +/* + * Inode table btree leaf operations + * + * Original copyright (c) 2008 Daniel Phillips + * Licensed under the GPL version 3 + * + * By contributing changes to this file you grant the original copyright holder + * the right to distribute those changes under any license. + */ + +#include "tux3.h" + +struct ileaf { + be_u16 magic; /* Magic number */ + be_u16 count; /* Counts of used offset info entries */ + u32 pad; + be_u64 ibase; /* Base inode number */ + char table[]; /* ileaf data: inode attrs ... offset info */ +}; + +/* + * inode leaf format + * + * A leaf has a small header followed by a table of attributes. A vector of + * offsets within the block grows down from the top of the leaf towards the + * top of the attribute table, indexed by the difference between inum and + * leaf->ibase, the base inum of the table block. + */ + +static inline unsigned atdict(be_u16 *dict, unsigned at) +{ + return at ? from_be_u16(*(dict - at)) : 0; +} + +static inline void add_idict(be_u16 *dict, int n) +{ + *dict = to_be_u16(from_be_u16(*dict) + n); +} + +static inline unsigned icount(struct ileaf *leaf) +{ + return from_be_u16(leaf->count); +} + +static inline tuxkey_t ibase(struct ileaf *leaf) +{ + return from_be_u64(leaf->ibase); +} + +static void ileaf_btree_init(struct btree *btree) +{ + btree->entries_per_leaf = 1 << (btree->sb->blockbits - 6); +} + +static int ileaf_init(struct btree *btree, vleaf *leaf) +{ + printf("initialize inode leaf %p\n", leaf); + *(struct ileaf *)leaf = (struct ileaf){ to_be_u16(0x90de) }; + return 0; +} + +static int ileaf_sniff(struct btree *btree, vleaf *leaf) +{ + return ((struct ileaf *)leaf)->magic == to_be_u16(0x90de); +} + +static unsigned ileaf_need(struct btree *btree, vleaf *vleaf) +{ + be_u16 *dict = vleaf + btree->sb->blocksize; + unsigned count = icount(to_ileaf(vleaf)); + return atdict(dict, count) + count * sizeof(*dict); +} + +static unsigned ileaf_free(struct btree *btree, vleaf *leaf) +{ + return btree->sb->blocksize - ileaf_need(btree, leaf) - sizeof(struct ileaf); +} + +static void ileaf_dump(struct btree *btree, vleaf *vleaf) +{ + struct sb *sb = btree->sb; + struct ileaf *leaf = vleaf; + inum_t inum = ibase(leaf); + be_u16 *dict = vleaf + sb->blocksize; + unsigned offset = 0; + printf("inode table block 0x%Lx/%i (%x bytes free)\n", (L)ibase(leaf), icount(leaf), ileaf_free(btree, leaf)); + //hexdump(dict - icount(leaf), icount(leaf) * 2); + for (int i = -1; -i <= icount(leaf); i--, inum++) { + int limit = from_be_u16(dict[i]), size = limit - offset; + if (!size) + continue; + printf(" 0x%Lx: ", (L)inum); + //printf("[%x] ", offset); + if (size < 0) + printf("\n"); + else if (!size) + printf("\n"); + else { +#ifndef main + hexdump(leaf->table + offset, size); +#else + struct inode inode = { .i_sb = btree->sb }; + unsigned xsize = decode_xsize(&inode, leaf->table + offset, size); + inode.xcache = xsize ? new_xcache(xsize) : NULL; + decode_attrs(&inode, leaf->table + offset, size); + dump_attrs(&inode); + xcache_dump(&inode); + free(inode.xcache); +#endif + } + offset = limit; + } +} + +void *ileaf_lookup(struct btree *btree, inum_t inum, struct ileaf *leaf, unsigned *result) +{ + assert(inum >= ibase(leaf)); + assert(inum < ibase(leaf) + btree->entries_per_leaf); + unsigned at = inum - ibase(leaf), size = 0; + void *attrs = NULL; + printf("lookup inode 0x%Lx, %Lx + %x\n", (L)inum, (L)ibase(leaf), at); + if (at < icount(leaf)) { + be_u16 *dict = (void *)leaf + btree->sb->blocksize; + unsigned offset = atdict(dict, at); + if ((size = from_be_u16(*(dict - at - 1)) - offset)) + attrs = leaf->table + offset; + } + *result = size; + return attrs; +} + +static int isinorder(struct btree *btree, struct ileaf *leaf) +{ + be_u16 *dict = (void *)leaf + btree->sb->blocksize; + for (int i = 0, offset = 0, limit; --i >= -icount(leaf); offset = limit) + if ((limit = from_be_u16(dict[i])) < offset) + return 0; + return 1; +} + +/* userland only */ +int ileaf_check(struct btree *btree, struct ileaf *leaf) +{ + char *why; + why = "not an inode table leaf"; + if (leaf->magic != to_be_u16(0x90de)) + goto eek; + why = "dict out of order"; + if (!isinorder(btree, leaf)) + goto eek; + return 0; +eek: + printf("%s!\n", why); + return -1; +} + +static void ileaf_trim(struct btree *btree, struct ileaf *leaf) +{ + be_u16 *dict = (void *)leaf + btree->sb->blocksize; + while (icount(leaf) > 1 && *(dict - icount(leaf)) == *(dict - icount(leaf) + 1)) + leaf->count = to_be_u16(from_be_u16(leaf->count) - 1); + if (icount(leaf) == 1 && !*(dict - 1)) + leaf->count = 0; +} + +#define SPLIT_AT_INUM + +static tuxkey_t ileaf_split(struct btree *btree, tuxkey_t inum, vleaf *from, vleaf *into) +{ + assert(ileaf_sniff(btree, from)); + struct ileaf *leaf = from, *dest = into; + be_u16 *dict = from + btree->sb->blocksize, *destdict = into + btree->sb->blocksize; + +#ifdef SPLIT_AT_INUM + printf("split at inum 0x%Lx\n", (L)inum); + assert(inum >= ibase(leaf)); + unsigned at = inum - ibase(leaf) < icount(leaf) ? inum - ibase(leaf) : icount(leaf); +#else + /* binsearch inum starting nearest middle of block */ + unsigned at = 1, hi = icount(leaf); + while (at < hi) { + int mid = (at + hi) / 2; + if (*(dict - mid) < (btree->sb->blocksize / 2)) + at = mid + 1; + else + hi = mid; + } +#endif + /* should trim leading empty inodes on copy */ + unsigned split = atdict(dict, at), free = from_be_u16(*(dict - icount(leaf))); + printf("split at %x of %x\n", at, icount(leaf)); + printf("copy out %x bytes at %x\n", free - split, split); + assert(free >= split); + memcpy(dest->table, leaf->table + split, free - split); + dest->count = to_be_u16(icount(leaf) - at); + veccopy(destdict - icount(dest), dict - icount(leaf), icount(dest)); + for (int i = 1; i <= icount(dest); i++) + add_idict(destdict - i, -split); +#ifdef SPLIT_AT_INUM + /* round down to multiple of 64 above ibase */ + inum_t round = inum & ~(inum_t)(btree->entries_per_leaf - 1); + dest->ibase = to_be_u64(round > ibase(leaf) + icount(leaf) ? round : inum); +#else + dest->ibase = to_be_u64(ibase(leaf) + at); +#endif + leaf->count = to_be_u16(at); + memset(leaf->table + split, 0, (char *)(dict - icount(leaf)) - (leaf->table + split)); + ileaf_trim(btree, leaf); + return ibase(dest); +} + +/* userland only */ +void ileaf_merge(struct btree *btree, struct ileaf *leaf, struct ileaf *from) +{ + if (!icount(from)) + return; + be_u16 *dict = (void *)leaf + btree->sb->blocksize; + be_u16 *fromdict = (void *)from + btree->sb->blocksize; + unsigned at = icount(leaf), free = atdict(dict, at), size = atdict(fromdict, icount(from)); + printf("copy in %i bytes\n", size); + memcpy(leaf->table + free, from->table, size); + leaf->count = to_be_u16(from_be_u16(leaf->count) + icount(from)); + veccopy(dict - icount(leaf), fromdict - icount(from), icount(from)); + for (int i = at + 1; at && i <= at + icount(from); i++) + add_idict(dict - i, from_be_u16(*(dict - at))); +} + +static void *ileaf_resize(struct btree *btree, tuxkey_t inum, vleaf *base, unsigned newsize) +{ + assert(ileaf_sniff(btree, base)); + struct ileaf *leaf = base; + assert(inum >= ibase(leaf)); + be_u16 *dict = base + btree->sb->blocksize; + + unsigned at = inum - ibase(leaf); + if (at >= btree->entries_per_leaf) + return NULL; + + unsigned extend_empty = at < icount(leaf) ? 0 : at - icount(leaf) + 1; + unsigned offset = at && icount(leaf) ? from_be_u16(*(dict - (at < icount(leaf) ? at : icount(leaf)))) : 0; + unsigned size = at < icount(leaf) ? from_be_u16(*(dict - at - 1)) - offset : 0; + int more = newsize - size; + if (more > 0 && sizeof(*dict) * extend_empty + more > ileaf_free(btree, leaf)) + return NULL; + for (; extend_empty--; leaf->count = to_be_u16(from_be_u16(leaf->count) + 1)) + *(dict - icount(leaf) - 1) = to_be_u16(atdict(dict, icount(leaf))); + assert(icount(leaf)); + unsigned itop = from_be_u16(*(dict - icount(leaf))); + void *attrs = leaf->table + offset; + printf("resize inum 0x%Lx at 0x%x from %x to %x\n", (L)inum, offset, size, newsize); + + assert(itop >= offset + size); + memmove(attrs + newsize, attrs + size, itop - offset - size); + for (int i = at + 1; i <= icount(leaf); i++) + add_idict(dict - i, more); + return attrs; +} + +inum_t find_empty_inode(struct btree *btree, struct ileaf *leaf, inum_t goal) +{ + assert(goal >= ibase(leaf)); + goal -= ibase(leaf); + //printf("find empty inode starting at %Lx, base %Lx\n", (L)goal, (L)ibase(leaf)); + be_u16 *dict = (void *)leaf + btree->sb->blocksize; + unsigned i, offset = goal && goal < icount(leaf) ? from_be_u16(*(dict - goal)) : 0; + for (i = goal; i < icount(leaf); i++) { + unsigned limit = from_be_u16(*(dict - i - 1)); + if (offset == limit) + break; + offset = limit; + } + return i + ibase(leaf); +} + +int ileaf_purge(struct btree *btree, inum_t inum, struct ileaf *leaf) +{ + if (inum < ibase(leaf) || inum - ibase(leaf) >= btree->entries_per_leaf) + return -EINVAL; + be_u16 *dict = (void *)leaf + btree->sb->blocksize; + unsigned at = inum - ibase(leaf); + unsigned offset = atdict(dict, at); + unsigned size = from_be_u16(*(dict - at - 1)) - offset; + printf("delete inode %Lx from %p[%x/%x]\n", (L)inum, leaf, at, size); + if (!size) + return -ENOENT; + unsigned free = from_be_u16(*(dict - icount(leaf))), tail = free - offset - size; + assert(offset + size + tail <= free); + memmove(leaf->table + offset, leaf->table + offset + size, tail); + for (int i = at + 1; i <= icount(leaf); i++) + add_idict(dict - i, -size); + ileaf_trim(btree, leaf); + return 0; +} + +struct btree_ops itable_ops = { + .btree_init = ileaf_btree_init, + .leaf_dump = ileaf_dump, + .leaf_sniff = ileaf_sniff, + .leaf_init = ileaf_init, + .leaf_split = ileaf_split, + .leaf_resize = ileaf_resize, + .balloc = balloc, +}; diff --git a/fs/tux3/inode.c b/fs/tux3/inode.c new file mode 100644 index 0000000..3ed4f36 --- /dev/null +++ b/fs/tux3/inode.c @@ -0,0 +1,501 @@ +/* + * Tux3 versioning filesystem in user space + * + * Original copyright (c) 2008 Daniel Phillips + * Licensed under the GPL version 3 + * + * By contributing changes to this file you grant the original copyright holder + * the right to distribute those changes under any license. + */ + +#include "tux3.h" + +#ifndef trace +#define trace trace_on +#endif + +static int check_present(struct inode *inode) +{ + tuxnode_t *tuxnode = tux_inode(inode); + switch (inode->i_mode & S_IFMT) { + default: + assert(tuxnode->present & MODE_OWNER_BIT); + /* FIXME: assert(!(tuxnode->present & RDEV_BIT)) */ + break; + case S_IFBLK: + case S_IFCHR: + assert(tuxnode->present & MODE_OWNER_BIT); + /* FIXME: assert(tuxnode->present & RDEV_BIT) */ + break; + case S_IFREG: + assert(tuxnode->present & MODE_OWNER_BIT); + assert(tuxnode->present & DATA_BTREE_BIT); + /* FIXME: assert(!(tuxnode->present & RDEV_BIT)) */ + break; + case S_IFDIR: + assert(tuxnode->present & MODE_OWNER_BIT); + assert(tuxnode->present & DATA_BTREE_BIT); + /* FIXME: assert(!(tuxnode->present & RDEV_BIT)) */ + break; + case S_IFLNK: + assert(tuxnode->present & MODE_OWNER_BIT); + assert(tuxnode->present & DATA_BTREE_BIT); + /* FIXME: assert(!(tuxnode->present & RDEV_BIT)) */ + break; + case 0: + assert(tuxnode->present & DATA_BTREE_BIT); + /* FIXME: assert(!(tuxnode->present & RDEV_BIT)) */ + break; + } + return 0; +} + +static inline void tux_set_inum(struct inode *inode, inum_t inum) +{ +#ifdef __KERNEL__ + inode->i_ino = inum; +#endif + tux_inode(inode)->inum = inum; +} + +#ifdef __KERNEL__ +static void tux_setup_inode(struct inode *inode, dev_t rdev); +#else +static void tux_setup_inode(struct inode *inode, dev_t rdev) +{ + inode->i_rdev = rdev; +} +#endif + +static struct inode *tux_new_inode(struct inode *dir, struct tux_iattr *iattr, + dev_t rdev) +{ + struct inode *inode = new_inode(dir->i_sb); + if (!inode) + return NULL; + + inode->i_mode = iattr->mode; + inode->i_uid = iattr->uid; + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(inode->i_mode)) + inode->i_mode |= S_ISGID; + } else + inode->i_gid = iattr->gid; + inode->i_mtime = inode->i_ctime = inode->i_atime = gettime(); + if (S_ISDIR(inode->i_mode)) + inode->i_nlink++; + tux_set_inum(inode, TUX_INVALID_INO); + tux_inode(inode)->present = CTIME_SIZE_BIT|MTIME_BIT|MODE_OWNER_BIT|DATA_BTREE_BIT|LINK_COUNT_BIT; + tux_setup_inode(inode, rdev); + return inode; +} + +static int open_inode(struct inode *inode) +{ + struct sb *sb = tux_sb(inode->i_sb); + int err; + struct cursor *cursor = alloc_cursor(&sb->itable, 0); + if (!cursor) + return -ENOMEM; + down_read(&cursor->btree->lock); + if ((err = probe(&sb->itable, tux_inode(inode)->inum, cursor))) + goto out; + unsigned size; + void *attrs = ileaf_lookup(&sb->itable, tux_inode(inode)->inum, bufdata(cursor_leafbuf(cursor)), &size); + if (!attrs) { + err = -ENOENT; + goto release; + } + trace("found inode 0x%Lx", (L)tux_inode(inode)->inum); + //ileaf_dump(&sb->itable, bufdata(cursor[depth].buffer)); + //hexdump(attrs, size); + unsigned xsize = decode_xsize(inode, attrs, size); + err = -ENOMEM; + if (xsize && !(tux_inode(inode)->xcache = new_xcache(xsize))) + goto release; + decode_attrs(inode, attrs, size); // error??? + dump_attrs(inode); + if (tux_inode(inode)->xcache) + xcache_dump(inode); + check_present(inode); + tux_setup_inode(inode, inode->i_rdev); + err = 0; +release: + release_cursor(cursor); +out: + up_read(&cursor->btree->lock); + free_cursor(cursor); + return err; +} + +static int store_attrs(struct inode *inode, struct cursor *cursor) +{ + unsigned size = encode_asize(tux_inode(inode)->present) + encode_xsize(inode); + void *base = tree_expand(&tux_sb(inode->i_sb)->itable, tux_inode(inode)->inum, size, cursor); + if (!base) + return -ENOMEM; // ERR_PTR me!!! + void *attr = encode_attrs(inode, base, size); + attr = encode_xattrs(inode, attr, base + size - attr); + assert(attr == base + size); + mark_buffer_dirty(cursor_leafbuf(cursor)); + return 0; +} + +/* + * Inode table expansion algorithm + * + * First probe for the inode goal. This retreives the rightmost leaf that + * contains an inode less than or equal to the goal. (We could in theory avoid + * retrieving any leaf at all in some cases if we observe that the the goal must + * fall into an unallocated gap between two index keys, for what that is worth. + * Probably not very much.) + * + * If not at end then next key is greater than goal. This block has the highest + * ibase less than or equal to goal. Ibase should be equal to btree key, so + * assert. Search block even if ibase is way too low. If goal comes back equal + * to next_key then there is no room to create more inodes in it, so advance to + * the next block and repeat. + * + * Otherwise, expand the inum goal that came back. If ibase was too low to + * create the inode in that block then the low level split will fail and expand + * will create a new inode table block with ibase at the goal. We round the + * goal down to some binary multiple in ileaf_split to reduce the chance of + * creating inode table blocks with only a small number of inodes. (Actually + * we should only round down the split point, not the returned goal.) + */ + +static int make_inode(struct inode *inode, inum_t goal) +{ + struct sb *sb = tux_sb(inode->i_sb); + int err = -ENOENT, depth = sb->itable.root.depth; + struct cursor *cursor = alloc_cursor(&sb->itable, 1); /* +1 for now depth */ + if (!cursor) + return -ENOMEM; + down_write(&cursor->btree->lock); + if ((err = probe(&sb->itable, goal, cursor))) + goto out; + struct buffer_head *leafbuf = cursor_leafbuf(cursor); + + /* FIXME: inum allocation should check min and max */ + trace("create inode 0x%Lx", (L)goal); + assert(!tux_inode(inode)->btree.root.depth); + assert(goal < next_key(cursor, depth)); + while (1) { + trace_off("find empty inode in [%Lx] base %Lx", (L)bufindex(leafbuf), (L)ibase(leaf)); + goal = find_empty_inode(&sb->itable, bufdata(leafbuf), goal); + trace("result inum is %Lx, limit is %Lx", (L)goal, (L)next_key(cursor, depth)); + if (goal < next_key(cursor, depth)) + break; + int more = advance(&sb->itable, cursor); + if (more < 0) { + err = more; + goto out; + } + trace("no more inode space here, advance %i", more); + if (!more) { + err = -ENOSPC; + goto release; + } + } + + tux_set_inum(inode, goal); + if (tux_inode(inode)->present & DATA_BTREE_BIT) + if ((err = new_btree(&tux_inode(inode)->btree, sb, &dtree_ops))) + goto release; + if ((err = store_attrs(inode, cursor))) + goto out; +release: + release_cursor(cursor); +out: + up_write(&cursor->btree->lock); + free_cursor(cursor); + return err; +} + +static int save_inode(struct inode *inode) +{ + assert(tux_inode(inode)->inum != TUX_INVALID_INO); + trace("save inode 0x%Lx", (L)tux_inode(inode)->inum); + struct sb *sb = tux_sb(inode->i_sb); + int err; + struct cursor *cursor = alloc_cursor(&sb->itable, 1); /* +1 for new depth */ + if (!cursor) + return -ENOMEM; + down_write(&cursor->btree->lock); + if ((err = probe(&sb->itable, tux_inode(inode)->inum, cursor))) + goto out; + /* paranoia check */ + unsigned size; + if (!(ileaf_lookup(&sb->itable, tux_inode(inode)->inum, bufdata(cursor_leafbuf(cursor)), &size))) { + err = -EINVAL; + goto release; + } + if ((err = store_attrs(inode, cursor))) + goto out; +release: + release_cursor(cursor); +out: + up_write(&cursor->btree->lock); + free_cursor(cursor); + return err; +} + +static int purge_inum(struct btree *btree, inum_t inum) +{ + struct cursor *cursor = alloc_cursor(btree, 0); + if (!cursor) + return -ENOMEM; + + int err = -ENOENT; + if (!(err = probe(btree, inum, cursor))) { + /* FIXME: truncate the bnode and leaf if empty. */ + struct buffer_head *ileafbuf = cursor_leafbuf(cursor); + struct ileaf *ileaf = to_ileaf(bufdata(ileafbuf)); + err = ileaf_purge(btree, inum, ileaf); + if (!err) + mark_buffer_dirty(ileafbuf); + release_cursor(cursor); + } + free_cursor(cursor); + return err; +} + +#ifdef __KERNEL__ +static int tux_can_truncate(struct inode *inode) +{ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return 0; + if (S_ISREG(inode->i_mode)) + return 1; + if (S_ISDIR(inode->i_mode)) + return 1; + if (S_ISLNK(inode->i_mode)) + return 1; + return 0; +} + +static void tux3_truncate(struct inode *inode) +{ + struct sb *sb = tux_sb(inode->i_sb); + struct delete_info del_info = { + .key = (inode->i_size + sb->blockmask) >> sb->blockbits, + }; + int err; + + if (!tux_can_truncate(inode)) + return; + /* FIXME: must fix expand size */ + WARN_ON(inode->i_size); + block_truncate_page(inode->i_mapping, inode->i_size, tux3_get_block); + err = tree_chop(&tux_inode(inode)->btree, &del_info, 0); + inode->i_blocks = ((inode->i_size + sb->blockmask) + & ~(loff_t)sb->blockmask) >> 9; + inode->i_mtime = inode->i_ctime = gettime(); + mark_inode_dirty(inode); +} + +void tux3_delete_inode(struct inode *inode) +{ + struct sb *sb = tux_sb(inode->i_sb); + inum_t inum = tux_inode(inode)->inum; + + truncate_inode_pages(&inode->i_data, 0); + if (is_bad_inode(inode)) { + clear_inode(inode); + return; + } + inode->i_size = 0; + if (inode->i_blocks) + tux3_truncate(inode); + /* FIXME: we have to free dtree-root, atable entry, etc too */ + + /* clear_inode() before freeing this ino. */ + clear_inode(inode); + + purge_inum(&sb->itable, inum); +} + +void tux3_clear_inode(struct inode *inode) +{ + if (tux_inode(inode)->xcache) + kfree(tux_inode(inode)->xcache); +} + +int tux3_write_inode(struct inode *inode, int do_sync) +{ + BUG_ON(tux_inode(inode)->inum == TUX_BITMAP_INO || + tux_inode(inode)->inum == TUX_VTABLE_INO || + tux_inode(inode)->inum == TUX_ATABLE_INO); + return save_inode(inode); +} + +int tux3_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + generic_fillattr(inode, stat); + stat->ino = tux_inode(inode)->inum; + return 0; +} + +int tux3_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int err; + + err = inode_change_ok(inode, iattr); + if (err) + return err; + return inode_setattr(inode, iattr); +} + +static const struct file_operations tux_file_fops = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, +// .unlocked_ioctl = fat_generic_ioctl, +#ifdef CONFIG_COMPAT +// .compat_ioctl = fat_compat_dir_ioctl, +#endif + .mmap = generic_file_mmap, + .open = generic_file_open, +// .fsync = file_fsync, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +}; + +static const struct inode_operations tux_file_iops = { + .truncate = tux3_truncate, +// .permission = ext4_permission, + .setattr = tux3_setattr, + .getattr = tux3_getattr +#ifdef CONFIG_EXT4DEV_FS_XATTR +// .setxattr = generic_setxattr, +// .getxattr = generic_getxattr, +// .listxattr = ext4_listxattr, +// .removexattr = generic_removexattr, +#endif +// .fallocate = ext4_fallocate, +// .fiemap = ext4_fiemap, +}; + +static const struct inode_operations tux_special_iops = { +// .permission = ext4_permission, +// .setattr = ext4_setattr, + .getattr = tux3_getattr +#ifdef CONFIG_EXT4DEV_FS_XATTR +// .setxattr = generic_setxattr, +// .getxattr = generic_getxattr, +// .listxattr = ext4_listxattr, +// .removexattr = generic_removexattr, +#endif +}; + +const struct inode_operations tux_symlink_iops = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .getattr = tux3_getattr, +#if 0 +// .setxattr = generic_setxattr, +// .getxattr = generic_getxattr, +// .listxattr = ext4_listxattr, +// .removexattr = generic_removexattr, +#endif +}; + +static void tux_setup_inode(struct inode *inode, dev_t rdev) +{ + struct sb *sbi = tux_sb(inode->i_sb); + + inode->i_blocks = ((inode->i_size + sbi->blockmask) + & ~(loff_t)sbi->blockmask) >> 9; +// inode->i_generation = 0; +// inode->i_flags = 0; + + switch (inode->i_mode & S_IFMT) { + default: + inode->i_op = &tux_special_iops; + init_special_inode(inode, inode->i_mode, rdev); + break; + case S_IFREG: + inode->i_op = &tux_file_iops; + inode->i_fop = &tux_file_fops; + inode->i_mapping->a_ops = &tux_aops; + break; + case S_IFDIR: + inode->i_op = &tux_dir_iops; + inode->i_fop = &tux_dir_fops; + inode->i_mapping->a_ops = &tux_blk_aops; +// mapping_set_gfp_mask(inode->i_mapping, GFP_USER_PAGECACHE); + mapping_set_gfp_mask(inode->i_mapping, GFP_USER); + break; + case S_IFLNK: + inode->i_op = &tux_symlink_iops; + inode->i_mapping->a_ops = &tux_aops; + break; + case 0: + /* FIXME: bitmap, vtable, atable doesn't have S_IFMT */ + /* set fake i_size to escape the check of read/writepage */ + inode->i_size = MAX_LFS_FILESIZE; + inode->i_mapping->a_ops = &tux_blk_aops; +// mapping_set_gfp_mask(inode->i_mapping, GFP_USER_PAGECACHE); + mapping_set_gfp_mask(inode->i_mapping, GFP_USER); + break; + } +} + +struct inode *tux_create_inode(struct inode *dir, int mode, dev_t rdev) +{ + struct tux_iattr iattr = { + .uid = current->fsuid, + .gid = current->fsgid, + .mode = mode, + }; + struct inode *inode = tux_new_inode(dir, &iattr, rdev); + if (!inode) + return ERR_PTR(-ENOMEM); + int err = make_inode(inode, tux_sb(dir->i_sb)->nextalloc); + if (err) { + make_bad_inode(inode); + iput(inode); + return ERR_PTR(err); + } + insert_inode_hash(inode); + return inode; +} + +static int tux_test(struct inode *inode, void *data) +{ + return tux_inode(inode)->inum == *(inum_t *)data; +} + +static int tux_set(struct inode *inode, void *data) +{ + tux_set_inum(inode, *(inum_t *)data); + return 0; +} + +struct inode *tux3_iget(struct super_block *sb, inum_t inum) +{ + struct inode *inode; + int err; + + inode = iget5_locked(sb, inum, tux_test, tux_set, &inum); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = open_inode(inode); + if (err) { + iget_failed(inode); + return ERR_PTR(err); + } + unlock_new_inode(inode); + return inode; +} + +#endif /* !__KERNEL__ */ diff --git a/fs/tux3/namei.c b/fs/tux3/namei.c new file mode 100644 index 0000000..1fc2daf --- /dev/null +++ b/fs/tux3/namei.c @@ -0,0 +1,261 @@ +#include "tux3.h" + +static struct dentry *tux3_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct buffer_head *buffer; + struct inode *inode; + tux_dirent *entry; + + entry = tux_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &buffer); + if (IS_ERR(entry)) { + if (PTR_ERR(entry) != -ENOENT) + return ERR_PTR(PTR_ERR(entry)); + inode = NULL; + goto out; + } + inode = tux3_iget(dir->i_sb, from_be_u64(entry->inum)); + brelse(buffer); + if (IS_ERR(inode)) + return ERR_PTR(PTR_ERR(inode)); +out: + return d_splice_alias(inode, dentry); +} + +static int __tux_add_dirent(struct inode *dir, struct dentry *dentry, struct inode *inode) +{ + loff_t where; + + where = tux_create_entry(dir, dentry->d_name.name, dentry->d_name.len, + tux_inode(inode)->inum, inode->i_mode); + if (where < 0) + return where; + return 0; +} + +static int tux_add_dirent(struct inode *dir, struct dentry *dentry, struct inode *inode) +{ + int err = __tux_add_dirent(dir, dentry, inode); + if (!err) + d_instantiate(dentry, inode); + return err; +} + +static int tux_update_dirent(struct buffer_head *buffer, tux_dirent *entry, struct inode *inode) +{ + return tux_update_entry(buffer, entry, tux_inode(inode)->inum, inode->i_mode); +} + +static int tux_del_dirent(struct inode *dir, struct dentry *dentry) +{ + struct buffer_head *buffer; + tux_dirent *entry = tux_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &buffer); + return IS_ERR(entry) ? PTR_ERR(entry) : tux_delete_entry(buffer, entry); +} + +static int tux3_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +{ + struct inode *inode; + int err; + +// if (!huge_valid_dev(rdev)) +// return -EINVAL; + + inode = tux_create_inode(dir, mode, rdev); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + err = tux_add_dirent(dir, dentry, inode); + if (!err) + return 0; + inode_dec_link_count(inode); + iput(inode); + } + return err; +} + +static int tux3_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) +{ + return tux3_mknod(dir, dentry, mode, 0); +} + +static int tux3_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int err; + if (dir->i_nlink >= TUX_LINK_MAX) + return -EMLINK; + err = tux3_mknod(dir, dentry, S_IFDIR | mode, 0); + if (!err) + inode_inc_link_count(dir); + return err; +} + +static int tux3_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + int err; + + if (inode->i_nlink >= TUX_LINK_MAX) + return -EMLINK; + + inode->i_ctime = gettime(); + inode_inc_link_count(inode); + atomic_inc(&inode->i_count); + err = tux_add_dirent(dir, dentry, inode); + if (err) { + inode_dec_link_count(inode); + iput(inode); + } + return err; +} + +static int tux3_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct inode *inode; + int err; + + inode = tux_create_inode(dir, S_IFLNK | S_IRWXUGO, 0); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + err = page_symlink(inode, symname, strlen(symname) + 1); + if (!err) { + err = tux_add_dirent(dir, dentry, inode); + if (!err) + return 0; + } + inode_dec_link_count(inode); + iput(inode); + } + return err; +} + +static int tux3_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int err = tux_del_dirent(dir, dentry); + if (!err) { + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + } + return err; +} + +static int tux3_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int err; + + err = tux_dir_is_empty(inode); + if (!err) { + err = tux_del_dirent(dir, dentry); + if (!err) { + inode->i_ctime = dir->i_ctime; + inode->i_size = 0; + clear_nlink(inode); + mark_inode_dirty(inode); + inode_dec_link_count(dir); + } + } + return err; +} + +static int tux3_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct buffer_head *old_buffer, *new_buffer; + tux_dirent *old_entry, *new_entry; + int err, new_subdir = 0; + + old_entry = tux_find_entry(old_dir, old_dentry->d_name.name, + old_dentry->d_name.len, &old_buffer); + if (IS_ERR(old_entry)) + return PTR_ERR(old_entry); + + /* FIXME: is this needed? */ + BUG_ON(from_be_u64(old_entry->inum) != tux_inode(old_inode)->inum); + + if (new_inode) { + int old_is_dir = S_ISDIR(old_inode->i_mode); + if (old_is_dir) { + err = tux_dir_is_empty(new_inode); + if (err) + goto error; + } + + new_entry = tux_find_entry(new_dir, new_dentry->d_name.name, + new_dentry->d_name.len, &new_buffer); + if (IS_ERR(new_entry)) { + BUG_ON(PTR_ERR(new_entry) == -ENOENT); + err = PTR_ERR(new_entry); + goto error; + } + err = tux_update_dirent(new_buffer, new_entry, old_inode); + if (err) + goto error; + new_inode->i_ctime = new_dir->i_ctime; + if (old_is_dir) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + } else { + new_subdir = S_ISDIR(old_inode->i_mode) && new_dir != old_dir; + if (new_subdir) { + if (new_dir->i_nlink >= TUX_LINK_MAX) { + err = -EMLINK; + goto error; + } + } + err = __tux_add_dirent(new_dir, new_dentry, old_inode); + if (err) + goto error; + if (new_subdir) + inode_inc_link_count(new_dir); + } + old_inode->i_ctime = new_dir->i_ctime; + mark_inode_dirty(old_inode); + + err = tux_delete_entry(old_buffer, old_entry); + if (err) { + printk(KERN_ERR "TUX3: %s: couldn't delete old entry (%Lu)\n", + __func__, (L)tux_inode(old_inode)->inum); + /* FIXME: now, we have hardlink even if it's dir. */ + inode_inc_link_count(old_inode); + } + if (!err && new_subdir) + inode_dec_link_count(old_dir); + + return err; + +error: + brelse(old_buffer); + return err; +} + +const struct file_operations tux_dir_fops = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = tux_readdir, +}; + +const struct inode_operations tux_dir_iops = { + .create = tux3_create, + .lookup = tux3_lookup, + .link = tux3_link, + .unlink = tux3_unlink, + .symlink = tux3_symlink, + .mkdir = tux3_mkdir, + .rmdir = tux3_rmdir, + .mknod = tux3_mknod, + .rename = tux3_rename, +// .setattr = ext3_setattr, + .getattr = tux3_getattr +// .setxattr = generic_setxattr, +// .getxattr = generic_getxattr, +// .listxattr = ext3_listxattr, +// .removexattr = generic_removexattr, +// .permission = ext3_permission, + /* FIXME: why doesn't ext4 support this for directory? */ +// .fallocate = ext4_fallocate, +// .fiemap = ext4_fiemap, +}; diff --git a/fs/tux3/super.c b/fs/tux3/super.c new file mode 100644 index 0000000..a600ced --- /dev/null +++ b/fs/tux3/super.c @@ -0,0 +1,306 @@ +/* + * tux3/super.c + * Copyright (c) 2008, Daniel Phillips + * Portions copyright (c) 2008, Maciej Zenczykowski + */ + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#include +#endif + +#include "tux3.h" + +static int unpack_sb(struct sb *sb, struct disksuper *super, int silent) +{ + u64 iroot = from_be_u64(super->iroot); + if (memcmp(super->magic, (char[])SB_MAGIC, sizeof(super->magic))) { + if (!silent) + printf("invalid superblock [%Lx]\n", + (L)from_be_u64(*(be_u64 *)super->magic)); + return -EINVAL; + } + +// sb->rootbuf; + sb->blockbits = from_be_u16(super->blockbits); + sb->blocksize = 1 << sb->blockbits; + sb->blockmask = (1 << sb->blockbits) - 1; + /* FIXME: those should be initialized based on blocksize. */ + sb->entries_per_node = 20; + sb->max_inodes_per_block = 64; +// sb->version; + sb->atomref_base = 1 << (40 - sb->blockbits); // see xattr.c + sb->unatom_base = sb->atomref_base + (1 << (34 - sb->blockbits)); + sb->volblocks = from_be_u64(super->volblocks); + sb->freeblocks = from_be_u64(super->freeblocks); + sb->nextalloc = from_be_u64(super->nextalloc); + sb->atomgen = from_be_u32(super->atomgen); + sb->freeatom = from_be_u32(super->freeatom); + sb->dictsize = from_be_u64(super->dictsize); + + init_btree(&sb->itable, sb, unpack_root(iroot), &itable_ops); + + return 0; +} + +static void pack_sb(struct sb *sb, struct disksuper *super) +{ + super->blockbits = to_be_u16(sb->blockbits); + super->volblocks = to_be_u64(sb->volblocks); + super->freeblocks = to_be_u64(sb->freeblocks); // probably does not belong here + super->nextalloc = to_be_u64(sb->nextalloc); // probably does not belong here + super->atomgen = to_be_u32(sb->atomgen); // probably does not belong here + super->freeatom = to_be_u32(sb->freeatom); // probably does not belong here + super->dictsize = to_be_u64(sb->dictsize); // probably does not belong here + super->iroot = to_be_u64(pack_root(&sb->itable.root)); +} + +#ifdef __KERNEL__ +static struct kmem_cache *tux_inode_cachep; + +static void tux3_inode_init_once(struct kmem_cache *cachep, void *mem) +{ + inode_init_once(&((tuxnode_t *)mem)->vfs_inode); +} + +static int __init tux3_init_inodecache(void) +{ + tux_inode_cachep = kmem_cache_create("tux_inode_cache", + sizeof(tuxnode_t), 0, (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), + tux3_inode_init_once); + if (tux_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void __exit tux3_destroy_inodecache(void) +{ + kmem_cache_destroy(tux_inode_cachep); +} + +static struct inode *tux3_alloc_inode(struct super_block *sb) +{ + static struct timespec epoch; + tuxnode_t *tuxi = kmem_cache_alloc(tux_inode_cachep, GFP_KERNEL); + if (!tuxi) + return NULL; + tuxi->btree = (struct btree){ }; + tuxi->present = 0; + tuxi->xcache = NULL; + + /* uninitialized stuff by alloc_inode() */ + tuxi->vfs_inode.i_version = 1; + tuxi->vfs_inode.i_uid = 0; + tuxi->vfs_inode.i_gid = 0; + tuxi->vfs_inode.i_atime = epoch; + tuxi->vfs_inode.i_mtime = epoch; + tuxi->vfs_inode.i_ctime = epoch; + tuxi->vfs_inode.i_mode = 0; + return &tuxi->vfs_inode; +} + +static void tux3_destroy_inode(struct inode *inode) +{ + kmem_cache_free(tux_inode_cachep, tux_inode(inode)); +} + +static int tux_load_sb(struct super_block *sb, int silent) +{ + struct buffer_head *bh; + int err; + + BUG_ON(SB_LOC < sb->s_blocksize); + bh = sb_bread(sb, SB_LOC >> sb->s_blocksize_bits); + if (!bh) { + if (!silent) + printk(KERN_ERR "TUX3: unable to read superblock\n"); + return -EIO; + } + err = unpack_sb(tux_sb(sb), bufdata(bh), silent); + /* FIXME: this is needed? */ + memcpy(&tux_sb(sb)->super, bufdata(bh), sizeof(tux_sb(sb)->super)); + brelse(bh); + + return err; +} + +static void tux3_write_super(struct super_block *sb) +{ + struct buffer_head *bh; + + BUG_ON(SB_LOC < sb->s_blocksize); + bh = sb_bread(sb, SB_LOC >> sb->s_blocksize_bits); + if (!bh) { + printk(KERN_ERR "TUX3: unable to read superblock\n"); + return; + } + pack_sb(tux_sb(sb), bufdata(bh)); + brelse_dirty(bh); + sb->s_dirt = 0; +} + +static void tux3_put_super(struct super_block *sb) +{ + struct sb *sbi = tux_sb(sb); + + /* FIXME: remove this, then use sb->s_dirt instead */ + tux3_write_super(sb); + + iput(sbi->atable); + iput(sbi->bitmap); + + sb->s_fs_info = NULL; + kfree(sbi); +} + +static int tux3_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct sb *sbi = tux_sb(sb); + + buf->f_type = sb->s_magic; + buf->f_bsize = sbi->blocksize; + buf->f_blocks = sbi->volblocks; + buf->f_bfree = sbi->freeblocks; + buf->f_bavail = sbi->freeblocks; +#if 0 + buf->f_files = buf->f_blocks << (sbi->clus_bits - EXFAT_CHUNK_BITS) / 3; + buf->f_ffree = buf->f_blocks << (sbi->clus_bits - EXFAT_CHUNK_BITS) / 3; + buf->f_fsid.val[0] = sbi->serial_number; + /*buf->f_fsid.val[1];*/ +#endif + buf->f_namelen = TUX_NAME_LEN; +// buf->f_frsize = sbi->blocksize; + + return 0; +} + +static const struct super_operations tux3_super_ops = { + .alloc_inode = tux3_alloc_inode, + .destroy_inode = tux3_destroy_inode, + .delete_inode = tux3_delete_inode, + .clear_inode = tux3_clear_inode, + .write_inode = tux3_write_inode, + .write_super = tux3_write_super, + .put_super = tux3_put_super, + .statfs = tux3_statfs, +}; + +static int tux3_fill_super(struct super_block *sb, void *data, int silent) +{ + struct sb *sbi; + int err, blocksize; + + sbi = kzalloc(sizeof(struct sb), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + sb->s_fs_info = sbi; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_magic = 0x54555833; + sb->s_op = &tux3_super_ops; + sb->s_time_gran = 1; + + sbi->vfs_sb = sb; + + err = -EIO; + blocksize = sb_min_blocksize(sb, BLOCK_SIZE); + if (!blocksize) { + if (!silent) + printk(KERN_ERR "TUX3: unable to set blocksize\n"); + goto error; + } + + err = tux_load_sb(sb, silent); + if (err) + goto error; + printk("%s: sb %p, ops %p, depth %Lu, block %Lu, entries_per_leaf %d\n", + __func__, + sbi->itable.sb, sbi->itable.ops, + (L)sbi->itable.root.depth, (L)sbi->itable.root.block, + sbi->itable.entries_per_leaf); + printk("%s: blocksize %u, blockbits %u, blockmask %08x\n", + __func__, sbi->blocksize, sbi->blockbits, sbi->blockmask); + printk("%s: volblocks %Lu, freeblocks %Lu, nextalloc %Lu\n", + __func__, sbi->volblocks, sbi->freeblocks, sbi->nextalloc); + printk("%s: freeatom %u, atomgen %u\n", + __func__, sbi->freeatom, sbi->atomgen); + + if (sbi->blocksize != blocksize) { + if (!sb_set_blocksize(sb, sbi->blocksize)) { + printk(KERN_ERR "TUX3: blocksize too small for device.\n"); + goto error; + } + } + printk("%s: s_blocksize %lu\n", __func__, sb->s_blocksize); + +// struct inode *vtable; + sbi->bitmap = tux3_iget(sb, TUX_BITMAP_INO); + err = PTR_ERR(sbi->bitmap); + if (IS_ERR(sbi->bitmap)) + goto error; + + sbi->rootdir = tux3_iget(sb, TUX_ROOTDIR_INO); + err = PTR_ERR(sbi->rootdir); + if (IS_ERR(sbi->rootdir)) + goto error_bitmap; + + sbi->atable = tux3_iget(sb, TUX_ATABLE_INO); + err = PTR_ERR(sbi->atable); + if (IS_ERR(sbi->atable)) + goto error_rootdir; + + err = -ENOMEM; + sb->s_root = d_alloc_root(sbi->rootdir); + if (!sb->s_root) + goto error_atable; + + return 0; + +error_atable: + iput(sbi->atable); +error_rootdir: + iput(sbi->rootdir); +error_bitmap: + iput(sbi->bitmap); +error: + kfree(sbi); + return err; +} + +static int tux3_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) +{ + return get_sb_bdev(fs_type, flags, dev_name, data, tux3_fill_super, mnt); +} + +static struct file_system_type tux3_fs_type = { + .owner = THIS_MODULE, + .name = "tux3", + .fs_flags = FS_REQUIRES_DEV, + .get_sb = tux3_get_sb, + .kill_sb = kill_block_super, +}; + +static int __init init_tux3(void) +{ + int err = tux3_init_inodecache(); + if (err) + return err; + return register_filesystem(&tux3_fs_type); +} + +static void __exit exit_tux3(void) +{ + unregister_filesystem(&tux3_fs_type); + tux3_destroy_inodecache(); +} + +module_init(init_tux3) +module_exit(exit_tux3) +MODULE_LICENSE("GPL"); +#endif /* !__KERNEL__ */ diff --git a/fs/tux3/trace.h b/fs/tux3/trace.h new file mode 100644 index 0000000..d736466 --- /dev/null +++ b/fs/tux3/trace.h @@ -0,0 +1,25 @@ +#ifndef TRACE_H +#define TRACE_H + +#define logline(caller, fmt, args...) do { \ + printf("%s: ", caller); \ + printf(fmt , ##args); \ + printf("\n"); \ +} while (0) + +#define error(fmt, args...) ({ warn(fmt "!" , ##args); die(99); 1; }) +#define assert(expr) do { if (!(expr)) error("Failed assertion \"%s\"", #expr); } while (0) +#define warn(fmt, args...) do { logline(__func__, fmt , ##args); } while (0) +#define trace_off(...) do {} while (0) +#define trace_on(fmt, args...) warn(fmt , ##args) + +/* + * FIXME: this may want to change behavior by mount option. + * NOTE: don't assume this calls die(). + */ +#define tux_error(sb, fmt, args...) do { \ + warn(fmt , ##args); \ + die(100); \ +} while (0) + +#endif diff --git a/fs/tux3/tux3.h b/fs/tux3/tux3.h new file mode 100644 index 0000000..b601810 --- /dev/null +++ b/fs/tux3/tux3.h @@ -0,0 +1,725 @@ +#ifndef TUX3_H +#define TUX3_H + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include + +typedef loff_t block_t; + +#define printf printk +#define vprintf vprintk +#define die(code) BUG_ON(1) + +#include "trace.h" +#endif + +typedef long long L; // widen for printf on 64 bit systems + +#define PACKED __attribute__ ((packed)) +#define fieldtype(compound, field) typeof(((compound *)NULL)->field) +#define vecset(d, v, n) memset((d), (v), (n) * sizeof(*(d))) +#define veccopy(d, s, n) memcpy((d), (s), (n) * sizeof(*(d))) +#define vecmove(d, s, n) memmove((d), (s), (n) * sizeof(*(d))) + +typedef u64 fixed32; /* Tux3 time values */ +typedef u32 millisecond_t; +typedef u64 inum_t; +typedef u64 tuxkey_t; + +#ifdef __KERNEL__ +/* Endian support */ +typedef __be16 be_u16; +typedef __be32 be_u32; +typedef __be64 be_u64; + +static inline u16 from_be_u16(be_u16 val) +{ + return be16_to_cpu(val); +} + +static inline u32 from_be_u32(be_u32 val) +{ + return be32_to_cpu(val); +} + +static inline u64 from_be_u64(be_u64 val) +{ + return be64_to_cpu(val); +} + +static inline be_u16 to_be_u16(u16 val) +{ + return cpu_to_be16(val); +} + +static inline be_u32 to_be_u32(u32 val) +{ + return cpu_to_be32(val); +} + +static inline be_u64 to_be_u64(u64 val) +{ + return cpu_to_be64(val); +} +#endif /* !__KERNEL__ */ + +static inline void *encode16(void *at, unsigned val) +{ + *(be_u16 *)at = to_be_u16(val); + return at + sizeof(u16); +} + +static inline void *encode32(void *at, unsigned val) +{ + *(be_u32 *)at = to_be_u32(val); + return at + sizeof(u32); +} + +static inline void *encode64(void *at, u64 val) +{ + *(be_u64 *)at = to_be_u64(val); + return at + sizeof(u64); +} + +static inline void *encode48(void *at, u64 val) +{ + at = encode16(at, val >> 32); + return encode32(at, val); +} + +static inline void *decode16(void *at, unsigned *val) +{ + *val = from_be_u16(*(be_u16 *)at); + return at + sizeof(u16); +} + +static inline void *decode32(void *at, unsigned *val) +{ + *val = from_be_u32(*(be_u32 *)at); + return at + sizeof(u32); +} + +static inline void *decode64(void *at, u64 *val) +{ + *val = from_be_u64(*(be_u64 *)at); + return at + sizeof(u64); +} + +static inline void *decode48(void *at, u64 *val) +{ + unsigned part1, part2; + at = decode16(at, &part1); + at = decode32(at, &part2); + *val = (u64)part1 << 32 | part2; + return at; +} + +/* Tux3 disk format */ +#define SB_MAGIC_SIZE 8 +#define SB_MAGIC { 't', 'u', 'x', '3', 0xdd, 0x08, 0x12, 0x12 } /* date of latest incompatible sb format */ +/* + * disk format revision history + * !!! always update this for every incompatible change !!! + * + * 2008-08-06: Beginning of time + * 2008-09-06: Actual checking starts + * 2008-12-12: Atom dictionary size in disksuper instead of atable->i_size + */ + +#define MAX_INODES_BITS 48 +#define MAX_BLOCKS_BITS 48 +#define MAX_FILESIZE_BITS 60 +#define MAX_FILESIZE (1LL << MAX_FILESIZE_BITS) +#define MAX_EXTENT (1 << 6) +#define SB_LOC (1 << 12) + +/* Special inode numbers */ +#define TUX_BITMAP_INO 0 +#define TUX_INVALID_INO 1 +#define TUX_VTABLE_INO 2 +#define TUX_ATABLE_INO 10 +#define TUX_ROOTDIR_INO 13 + +struct disksuper +{ + /* Update magic on any incompatible format change */ + char magic[SB_MAGIC_SIZE]; + be_u64 birthdate; /* Volume creation date */ + be_u64 flags; /* Need to assign some flags */ + be_u64 iroot; /* Root of the inode table btree */ + be_u64 aroot; /* The atime table is a file now, delete on next format rev */ + be_u16 blockbits; /* Shift to get volume block size */ + be_u16 unused1; /* Throw away on next format rev */ + be_u32 unused2; /* Throw away on next format rev */ + be_u64 volblocks; /* Volume size */ + /* The rest should be moved to a "metablock" that is updated frequently */ + be_u64 freeblocks; /* Should match total of zero bits in allocation bitmap */ + be_u64 nextalloc; /* Get rid of this when we have a real allocation policy */ + be_u32 freeatom; /* Beginning of persistent free atom list in atable */ + be_u32 atomgen; /* Next atom number if there are no free atoms */ + be_u64 dictsize; /* Size of the atom dictionary instead if i_size */ +}; + +struct root { + unsigned depth; /* btree levels not including leaf level */ + block_t block; /* disk location of btree root */ +}; + +struct btree { + struct rw_semaphore lock; + struct sb *sb; /* Convenience to reduce parameter list size */ + struct btree_ops *ops; /* Generic btree low level operations */ + struct root root; /* Cached description of btree root */ + u16 entries_per_leaf; /* Used in btree leaf splitting */ +}; + +/* Define layout of btree root on disk, endian conversion is elsewhere. */ + +static inline u64 pack_root(struct root *root) +{ + return (u64)root->depth << 48 | root->block; +} + +static inline struct root unpack_root(u64 v) +{ + return (struct root){ .depth = v >> 48, .block = v & (-1ULL >> 16), }; +} + +/* Path cursor for btree traversal */ + +struct cursor { + struct btree *btree; +#define CURSOR_DEBUG +#ifdef CURSOR_DEBUG +#define FREE_BUFFER ((void *)0xdbc06505) +#define FREE_NEXT ((void *)0xdbc06507) + int maxlen; +#endif + int len; + struct path_level { + struct buffer_head *buffer; + struct index_entry *next; + } path[]; +}; + +/* Tux3-specific sb is a handle for the entire volume state */ + +struct sb { + struct disksuper super; + struct btree itable; /* Cached root of the inode table */ + struct inode *bitmap; /* allocation bitmap special file */ + struct inode *rootdir; /* root directory special file */ + struct inode *vtable; /* version table special file */ + struct inode *atable; /* xattr atom special file */ + unsigned blocksize, blockbits, blockmask; + block_t volblocks, freeblocks, nextalloc; + unsigned entries_per_node; /* must be per-btree type, get rid of this */ + unsigned max_inodes_per_block; /* get rid of this and use entries per leaf */ + unsigned version; /* Currently mounted volume version view */ + unsigned atomref_base, unatom_base; /* layout of atom table */ + unsigned freeatom; /* Start of free atom list in atom table */ + unsigned atomgen; /* Next atom number to allocate if no free atoms */ + loff_t dictsize; /* Atom dictionary size */ +#ifdef __KERNEL__ + struct super_block *vfs_sb; /* Generic kernel superblock */ +#else + map_t *devmap; /* Userspace device block cache */ +#endif +}; + +#ifdef __KERNEL__ +/* + * In kernel an inode has a generic part and a filesystem-specific part + * with conversions between them, in order to support multiple different + * kinds of filesystem. In userspace there is only one kind of filesystem, + * Tux3, so no need for two kinds of inodes, and a big pain to initialize + * inodes for testing if there were. We use a typedef here so that the + * filesystem-specific type can be transparently aliased to the generic + * inode type in userspace and be a separate type in kernel. + */ + +typedef struct { + struct btree btree; + inum_t inum; /* Inode number. Fixme: also in generic inode */ + unsigned present; /* Attributes decoded from or to be encoded to inode table */ + struct xcache *xcache; /* Extended attribute cache */ + struct inode vfs_inode; /* Generic kernel inode */ +} tuxnode_t; + +static inline struct sb *tux_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct super_block *vfs_sb(struct sb *sb) +{ + return sb->vfs_sb; +} + +static inline tuxnode_t *tux_inode(struct inode *inode) +{ + return container_of(inode, tuxnode_t, vfs_inode); +} + +typedef struct address_space map_t; + +static inline map_t *mapping(struct inode *inode) +{ + return inode->i_mapping; +} + +static inline void *malloc(size_t size) +{ + might_sleep(); + return kmalloc(size, GFP_NOFS); +} + +static inline void free(void *ptr) +{ + kfree(ptr); +} +#else +typedef struct inode { + struct btree btree; + inum_t inum; + unsigned present; + struct xcache *xcache; + struct sb *i_sb; + map_t *map; + loff_t i_size; + unsigned i_version; + struct timespec i_mtime, i_ctime, i_atime; + unsigned i_mode, i_uid, i_gid, i_nlink; + struct mutex i_mutex; + dev_t i_rdev; +} tuxnode_t; + +struct file { + struct inode *f_inode; + unsigned f_version; + loff_t f_pos; +}; + +static inline struct sb *tux_sb(struct sb *sb) +{ + return sb; +} + +static inline struct sb *vfs_sb(struct sb *sb) +{ + return sb; +} + +static inline struct inode *tux_inode(struct inode *inode) +{ + return inode; +} + +static inline map_t *mapping(struct inode *inode) +{ + return inode->map; +} +#endif /* !__KERNEL__ */ + +#define TUX_LINK_MAX 64 /* just for debug for now */ + +#define TUX_NAME_LEN 255 + +/* directory entry */ +typedef struct { + be_u64 inum; + be_u16 rec_len; + u8 name_len, type; + char name[]; +} tux_dirent; + +/* version:10, count:6, block:48 */ +struct diskextent { be_u64 block_count_version; }; +#define MAX_GROUP_ENTRIES 255 +/* count:8, keyhi:24 */ +struct group { be_u32 count_and_keyhi; }; +/* limit:8, keylo:24 */ +struct entry { be_u32 limit_and_keylo; }; +struct dleaf { be_u16 magic, groups, free, used; struct diskextent table[]; }; + +struct dwalk { + struct dleaf *leaf; + struct group *group, *gstop, *gdict; + struct entry *entry, *estop; + struct diskextent *exbase, *extent, *exstop; + struct { + struct group group; + struct entry entry; + int used, free, groups; + } mock; +}; + +/* group wrappers */ + +static inline struct group make_group(tuxkey_t keyhi, unsigned count) +{ + return (struct group){ to_be_u32(keyhi | (count << 24)) }; +} + +static inline unsigned group_keyhi(struct group *group) +{ + return from_be_u32(*(be_u32 *)group) & 0xffffff; +} + +static inline unsigned group_count(struct group *group) +{ + return *(unsigned char *)group; +} + +static inline void set_group_count(struct group *group, int n) +{ + *(unsigned char *)group = n; +} + +static inline void inc_group_count(struct group *group, int n) +{ + *(unsigned char *)group += n; +} + +/* entry wrappers */ + +static inline struct entry make_entry(tuxkey_t keylo, unsigned limit) +{ + return (struct entry){ to_be_u32(keylo | (limit << 24)) }; +} + +static inline unsigned entry_keylo(struct entry *entry) +{ + return from_be_u32(*(be_u32 *)entry) & ~(-1 << 24); +} + +static inline unsigned entry_limit(struct entry *entry) +{ + return *(unsigned char *)entry; +} + +static inline void inc_entry_limit(struct entry *entry, int n) +{ + *(unsigned char *)entry += n; +} + +/* extent wrappers */ + +static inline struct diskextent make_extent(block_t block, unsigned count) +{ + assert(block < (1ULL << 48) && count - 1 < (1 << 6)); + return (struct diskextent){ to_be_u64(((u64)(count - 1) << 48) | block) }; +} + +static inline block_t extent_block(struct diskextent extent) +{ + return from_be_u64(*(be_u64 *)&extent) & ~(-1LL << 48); +} + +static inline unsigned extent_count(struct diskextent extent) +{ + return ((from_be_u64(*(be_u64 *)&extent) >> 48) & 0x3f) + 1; +} + +static inline unsigned extent_version(struct diskextent extent) +{ + return from_be_u64(*(be_u64 *)&extent) >> 54; +} + +/* dleaf wrappers */ + +static inline unsigned dleaf_groups(struct dleaf *leaf) +{ + return from_be_u16(leaf->groups); +} + +static inline void set_dleaf_groups(struct dleaf *leaf, int n) +{ + leaf->groups = to_be_u16(n); +} + +static inline void inc_dleaf_groups(struct dleaf *leaf, int n) +{ + leaf->groups = to_be_u16(from_be_u16(leaf->groups) + n); +} + +typedef void vleaf; + +struct btree_ops { + void (*btree_init)(struct btree *btree); + int (*leaf_sniff)(struct btree *btree, vleaf *leaf); + int (*leaf_init)(struct btree *btree, vleaf *leaf); + tuxkey_t (*leaf_split)(struct btree *btree, tuxkey_t key, vleaf *from, vleaf *into); + void *(*leaf_resize)(struct btree *btree, tuxkey_t key, vleaf *leaf, unsigned size); + void (*leaf_dump)(struct btree *btree, vleaf *leaf); + unsigned (*leaf_need)(struct btree *btree, vleaf *leaf); + unsigned (*leaf_free)(struct btree *btree, vleaf *leaf); + /* return value: 1 - modified, 0 - not modified, < 0 - error */ + int (*leaf_chop)(struct btree *btree, tuxkey_t key, vleaf *leaf); + void (*leaf_merge)(struct btree *btree, vleaf *into, vleaf *from); + block_t (*balloc)(struct sb *sb, unsigned blocks); + void (*bfree)(struct sb *sb, block_t block, unsigned blocks); +}; + +/* + * Tux3 times are 32.32 fixed point while time attributes are stored in 32.16 + * format, trading away some precision to compress time fields by two bytes + * each. It is not clear whether the saved space is worth the lower precision. + */ +#define TIME_ATTR_SHIFT 16 + +static inline u32 high32(fixed32 val) +{ + return val >> 32; +} + +static inline unsigned billionths(fixed32 val) +{ + return (((val & 0xffffffff) * 1000000000) + 0x80000000) >> 32; +} + +static inline struct timespec spectime(fixed32 time) +{ + return (struct timespec){ .tv_sec = high32(time), .tv_nsec = billionths(time) }; +} + +static inline fixed32 tuxtime(struct timespec time) +{ + return ((u64)time.tv_sec << 32) + ((time.tv_nsec * ((1ULL << 63) / 1000000000)) >> 31); +} + +static inline struct timespec gettime(void) +{ +#ifdef __KERNEL__ + return current_kernel_time(); +#else + struct timeval now; + gettimeofday(&now, NULL); + return (struct timespec){ .tv_sec = now.tv_sec, .tv_nsec = now.tv_usec * 1000 }; +#endif +} + +struct tux_iattr { + unsigned mode, uid, gid; +}; + +void hexdump(void *data, unsigned size); +block_t balloc(struct sb *sb, unsigned blocks); +void bfree(struct sb *sb, block_t start, unsigned blocks); + +enum atkind { + MIN_ATTR = 6, + MODE_OWNER_ATTR = 6, + DATA_BTREE_ATTR = 7, + CTIME_SIZE_ATTR = 8, + LINK_COUNT_ATTR = 9, + MTIME_ATTR = 10, + IDATA_ATTR = 11, + XATTR_ATTR = 12, + MAX_ATTRS, + VAR_ATTRS = IDATA_ATTR +}; + +enum atbit { + MODE_OWNER_BIT = 1 << MODE_OWNER_ATTR, + CTIME_SIZE_BIT = 1 << CTIME_SIZE_ATTR, + DATA_BTREE_BIT = 1 << DATA_BTREE_ATTR, + LINK_COUNT_BIT = 1 << LINK_COUNT_ATTR, + MTIME_BIT = 1 << MTIME_ATTR, + IDATA_BIT = 1 << IDATA_ATTR, + XATTR_BIT = 1 << XATTR_ATTR, +}; + +extern unsigned atsize[MAX_ATTRS]; + +#ifndef ENOATTR +#define ENOATTR ENOENT +#endif + +#ifndef XATTR_CREATE +#define XATTR_CREATE 1 // fail if xattr already exists +#define XATTR_REPLACE 2 // fail if xattr does not exist +#endif + +struct xattr { u16 atom, size; char body[]; }; +struct xcache { u16 size, maxsize; struct xattr xattrs[]; }; + +static inline struct xattr *xcache_next(struct xattr *xattr) +{ + return (void *)xattr->body + xattr->size; +} + +static inline struct xattr *xcache_limit(struct xcache *xcache) +{ + return (void *)xcache + xcache->size; +} + +static inline void *encode_kind(void *attrs, unsigned kind, unsigned version) +{ + return encode16(attrs, (kind << 12) | version); +} + +struct ileaf; +static inline struct ileaf *to_ileaf(vleaf *leaf) +{ + return leaf; +} + +/* for tree_chop */ +struct delete_info { + tuxkey_t key; + block_t blocks, freed; + block_t resume; + int create; +}; + +#ifdef __KERNEL__ +static inline void *bufdata(struct buffer_head *buffer) +{ + return buffer->b_data; +} + +static inline size_t bufsize(struct buffer_head *buffer) +{ + return buffer->b_size; +} + +static inline block_t bufindex(struct buffer_head *buffer) +{ + return buffer->b_blocknr; +} + +static inline int bufcount(struct buffer_head *buffer) +{ + return atomic_read(&buffer->b_count); +} + +#define bufmap(map) NULL // just ignore this until we have peekblk + +static inline struct inode *buffer_inode(struct buffer_head *buffer) +{ + return buffer->b_page->mapping->host; +} + +/* btree.c */ +struct buffer_head *cursor_leafbuf(struct cursor *cursor); +void release_cursor(struct cursor *cursor); +struct cursor *alloc_cursor(struct btree *btree, int); +void free_cursor(struct cursor *cursor); +void level_pop_brelse_dirty(struct cursor *cursor); +void level_push(struct cursor *cursor, struct buffer_head *buffer, struct index_entry *next); + +void init_btree(struct btree *btree, struct sb *sb, struct root root, struct btree_ops *ops); +int new_btree(struct btree *btree, struct sb *sb, struct btree_ops *ops); +struct buffer_head *new_leaf(struct btree *btree); +int probe(struct btree *btree, tuxkey_t key, struct cursor *cursor); +int advance(struct btree *btree, struct cursor *cursor); +tuxkey_t next_key(struct cursor *cursor, int depth); +int tree_chop(struct btree *btree, struct delete_info *info, millisecond_t deadline); +int btree_insert_leaf(struct cursor *cursor, tuxkey_t key, struct buffer_head *leafbuf); +int btree_leaf_split(struct btree *btree, struct cursor *cursor, tuxkey_t key); +void *tree_expand(struct btree *btree, tuxkey_t key, unsigned newsize, struct cursor *cursor); +void show_tree_range(struct btree *btree, tuxkey_t start, unsigned count); +void show_tree(struct btree *btree); + +/* dir.c */ +int tux_update_entry(struct buffer_head *buffer, tux_dirent *entry, inum_t inum, unsigned mode); +loff_t tux_create_entry(struct inode *dir, const char *name, int len, inum_t inum, unsigned mode); +tux_dirent *tux_find_entry(struct inode *dir, const char *name, int len, struct buffer_head **result); +int tux_delete_entry(struct buffer_head *buffer, tux_dirent *entry); +int tux_readdir(struct file *file, void *state, filldir_t filldir); +int tux_dir_is_empty(struct inode *dir); +extern const struct file_operations tux_dir_fops; +extern const struct inode_operations tux_dir_iops; + +/* dtree.c */ +int dleaf_init(struct btree *btree, vleaf *leaf); +unsigned dleaf_free(struct btree *btree, vleaf *leaf); +void dleaf_dump(struct btree *btree, vleaf *vleaf); +int dleaf_split_at(vleaf *from, vleaf *into, struct entry *entry, unsigned blocksize); +void dleaf_merge(struct btree *btree, vleaf *vinto, vleaf *vfrom); +unsigned dleaf_need(struct btree *btree, vleaf *vleaf); +extern struct btree_ops dtree_ops; + +int dwalk_end(struct dwalk *walk); +block_t dwalk_block(struct dwalk *walk); +unsigned dwalk_count(struct dwalk *walk); +tuxkey_t dwalk_index(struct dwalk *walk); +int dwalk_next(struct dwalk *walk); +int dwalk_back(struct dwalk *walk); +int dwalk_probe(struct dleaf *leaf, unsigned blocksize, struct dwalk *walk, tuxkey_t key); +int dwalk_mock(struct dwalk *walk, tuxkey_t index, struct diskextent extent); +void dwalk_copy(struct dwalk *walk, struct dleaf *dest); +void dwalk_chop(struct dwalk *walk); +int dwalk_add(struct dwalk *walk, tuxkey_t index, struct diskextent extent); + +/* filemap.c */ +int tux3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +extern const struct address_space_operations tux_aops; +extern const struct address_space_operations tux_blk_aops; + +/* iattr.c */ +unsigned encode_asize(unsigned bits); +void dump_attrs(struct inode *inode); +void *encode_attrs(struct inode *inode, void *attrs, unsigned size); +void *decode_attrs(struct inode *inode, void *attrs, unsigned size); + +/* ileaf.c */ +void *ileaf_lookup(struct btree *btree, inum_t inum, struct ileaf *leaf, unsigned *result); +inum_t find_empty_inode(struct btree *btree, struct ileaf *leaf, inum_t goal); +int ileaf_purge(struct btree *btree, inum_t inum, struct ileaf *leaf); +extern struct btree_ops itable_ops; + +/* inode.c */ +void tux3_delete_inode(struct inode *inode); +void tux3_clear_inode(struct inode *inode); +int tux3_write_inode(struct inode *inode, int do_sync); +int tux3_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); +struct inode *tux_create_inode(struct inode *dir, int mode, dev_t rdev); +struct inode *tux3_iget(struct super_block *sb, inum_t inum); +int tux3_setattr(struct dentry *dentry, struct iattr *iattr); + +/* symlink.c */ +extern const struct inode_operations tux_symlink_iops; + +/* xattr.c */ +int xcache_dump(struct inode *inode); +struct xcache *new_xcache(unsigned maxsize); +int get_xattr(struct inode *inode, const char *name, unsigned len, void *data, unsigned size); +int set_xattr(struct inode *inode, const char *name, unsigned len, const void *data, unsigned size, unsigned flags); +void *encode_xattrs(struct inode *inode, void *attrs, unsigned size); +unsigned decode_xsize(struct inode *inode, void *attrs, unsigned size); +unsigned encode_xsize(struct inode *inode); + +/* temporary hack for buffer */ +struct buffer_head *blockread(struct address_space *mapping, block_t iblock); +struct buffer_head *blockget(struct address_space *mapping, block_t iblock); + +static inline int buffer_empty(struct buffer_head *buffer) +{ + return 1; +} + +static inline struct buffer_head *set_buffer_empty(struct buffer_head *buffer) +{ + return buffer; +} + +static inline void brelse_dirty(struct buffer_head *buffer) +{ + mark_buffer_dirty(buffer); + brelse(buffer); +} +#else /* !__KERNEL__ */ +static inline struct inode *buffer_inode(struct buffer_head *buffer) +{ + return buffer->map->inode; +} +#endif /* !__KERNEL__ */ + +#endif diff --git a/fs/tux3/xattr.c b/fs/tux3/xattr.c new file mode 100644 index 0000000..585d3ca --- /dev/null +++ b/fs/tux3/xattr.c @@ -0,0 +1,504 @@ +/* + * Tux3 versioning filesystem in user space + * + * Original copyright (c) 2008 Daniel Phillips + * Licensed under the GPL version 3 + * + * By contributing changes to this file you grant the original copyright holder + * the right to distribute those changes under any license. + */ + +#include "tux3.h" + +#ifndef trace +#define trace trace_on +#endif + +/* Xattr Atoms */ + +/* + * Atom count table: + * + * * Both tables are mapped into the atom table at a high logical offset. + * Allowing 32 bits worth of atom numbers, and with at most 256 atom entries + * per 4K dirent block, we need about (32 << 8) = 1 TB dirent bytes for the + * atom dictionary, so the refcount tables start at block 2^40 >> 12 = 2^28. + * + * * The refcount table consists of pairs of blocks: even blocks with the low + * 16 bits of refcount and odd blocks with the high 16 bits. For 2^32 atoms + * that is 2^34 bytes at most, or 2^22 4K blocks. + * + * Atom reverse map: + * + * * When a new atom dirent is created we also set the reverse map for the + * dirent's atom number to the file offset at which the dirent was created. + * This will be 64 bits just to be lazy so that is 2^32 atoms * 8 bytes + * = 2^35 revmap bytes = 2^23 4K blocks. This starts just above the count + * table, which puts it at logical offset 2^28 + 2^23, leaving a gap after + * the count table in case we decide 32 bits of ref count is not enough. + */ + +typedef u32 atom_t; + +static inline atom_t entry_atom(tux_dirent *entry) +{ + return from_be_u64(entry->inum); +} + +static struct buffer_head *blockread_unatom(struct inode *atable, atom_t atom, unsigned *offset) +{ + unsigned shift = tux_sb(atable->i_sb)->blockbits - 3; + *offset = atom & ~(-1 << shift); + return blockread(mapping(atable), tux_sb(atable->i_sb)->unatom_base + (atom >> shift)); +} + +static int unatom(struct inode *atable, atom_t atom, char *name, unsigned size) +{ + unsigned offset; + struct sb *sb = tux_sb(atable->i_sb); + struct buffer_head *buffer = blockread_unatom(atable, atom, &offset); + if (!buffer) + return -ENOMEM; + u64 where = from_be_u64(((be_u64 *)bufdata(buffer))[offset]); + brelse(buffer); + buffer = blockread(mapping(atable), where >> sb->blockbits); + if (!buffer) + return -ENOMEM; + tux_dirent *entry = bufdata(buffer) + (where & sb->blockmask); + if (entry_atom(entry) != atom) { + warn("atom %x reverse entry broken", atom); + return -EINVAL; + } + unsigned len = entry->name_len; + if (size) { + if (len > size) { + brelse(buffer); + return -ERANGE; + } + memcpy(name, entry->name, len); + } + brelse(buffer); + return len; +} + +/* userland only */ +void dump_atoms(struct inode *atable) +{ + struct sb *sb = tux_sb(atable->i_sb); + unsigned blocks = (sb->atomgen + (sb->blockmask >> 1)) >> (sb->blockbits - 1); + for (unsigned j = 0; j < blocks; j++) { + unsigned block = sb->atomref_base + 2 * j; + struct buffer_head *lobuf, *hibuf; + if (!(lobuf = blockread(mapping(atable), block))) + goto eek; + if (!(hibuf = blockread(mapping(atable), block))) + goto eek; + be_u16 *lorefs = bufdata(lobuf), *hirefs = bufdata(hibuf); + for (unsigned i = 0; i < (sb->blocksize >> 1); i++) { + unsigned refs = (from_be_u16(hirefs[i]) << 16) + from_be_u16(lorefs[i]); + if (!refs) + continue; + atom_t atom = i; + char name[100]; + int len = unatom(atable, atom, name, sizeof(name)); + if (len < 0) + goto eek; + printf("%.*s = %x\n", len, name, atom); + } + brelse(lobuf); + brelse(hibuf); + } + return; +eek: + warn("atom name lookup failed"); + return; +} + +/* userland only */ +void show_freeatoms(struct sb *sb) +{ + struct inode *atable = sb->atable; + atom_t atom = sb->freeatom; + while (atom) { + warn("free atom: %Lx", (L)atom); + unsigned offset; + struct buffer_head *buffer = blockread_unatom(atable, atom, &offset); + if (!buffer) + goto eek; + u64 next = from_be_u64(((be_u64 *)bufdata(buffer))[offset]); + if ((next >> 48) != 0xdead) + goto eek; + atom = next & ~(-1LL << 48); + brelse(buffer); + } + return; +eek: + warn("eek"); +} + +static atom_t get_freeatom(struct inode *atable) +{ + struct sb *sb = tux_sb(atable->i_sb); + atom_t atom = sb->freeatom; + if (!atom) + return sb->atomgen++; + unsigned offset; + struct buffer_head *buffer = blockread_unatom(atable, atom, &offset); + if (!buffer) + goto eek; + u64 next = from_be_u64(((be_u64 *)bufdata(buffer))[offset]); + brelse(buffer); + if ((next >> 48) != 0xdead) + goto eek; + sb->freeatom = next & ~(-1LL << 48); + return atom; +eek: + warn("something horrible happened"); + return -1; +} + +static int use_atom(struct inode *atable, atom_t atom, int use) +{ + struct sb *sb = tux_sb(atable->i_sb); + unsigned shift = sb->blockbits - 1; + unsigned block = sb->atomref_base + 2 * (atom >> shift); + unsigned offset = atom & ~(-1 << shift), kill = 0; + struct buffer_head *buffer; + if (!(buffer = blockread(mapping(atable), block))) + return -EIO; + int low = from_be_u16(((be_u16 *)bufdata(buffer))[offset]) + use; + trace("inc atom %x by %i, offset %x[%x], low = %i", atom, use, block, offset, low); + ((be_u16 *)bufdata(buffer))[offset] = to_be_u16(low); + if (!low || (low & (-1 << 16))) { + brelse_dirty(buffer); + if (!(buffer = blockread(mapping(atable), block + 1))) + return -EIO; + int high = from_be_u16(((be_u16 *)bufdata(buffer))[offset]) + (low >> 16); + trace("carry %i, offset %x[%x], high = %i", low >> 16, block, offset, high); + ((be_u16 *)bufdata(buffer))[offset] = to_be_u16(high); + kill = !(low | high); + } + brelse_dirty(buffer); + if (kill) { + warn("delete atom %Lx", (L) atom); + buffer = blockread_unatom(atable, atom, &offset); + if (!buffer) + return -1; // better set a flag that unatom broke or something!!! + u64 where = from_be_u64(((be_u64 *)bufdata(buffer))[offset]); + brelse(buffer); + ((be_u64 *)bufdata(buffer))[offset] = to_be_u64((u64)sb->freeatom | (0xdeadLL << 48)); + sb->freeatom = atom; + buffer = blockread(mapping(atable), where >> sb->blockbits); + tux_dirent *entry = bufdata(buffer) + (where & sb->blockmask); + if (entry_atom(entry) == atom) + tux_delete_entry(buffer, entry); + else { + warn("atom entry not found"); + brelse(buffer); + } + } + return 0; +} + +// bug waiting to happen... +tux_dirent *_tux_find_entry(struct inode *dir, const char *name, int len, struct buffer_head **result, loff_t size); +loff_t _tux_create_entry(struct inode *dir, const char *name, int len, inum_t inum, unsigned mode, loff_t *size); + +static atom_t find_atom(struct inode *atable, const char *name, unsigned len) +{ + struct buffer_head *buffer; + tux_dirent *entry = _tux_find_entry(atable, name, len, &buffer, tux_sb(atable->i_sb)->dictsize); + if (IS_ERR(entry)) + return -1; /* FIXME: return correct errno */ + atom_t atom = entry_atom(entry); + brelse(buffer); + return atom; +} + +static atom_t make_atom(struct inode *atable, const char *name, unsigned len) +{ + atom_t atom = find_atom(atable, name, len); + if (atom != -1) + return atom; + atom = get_freeatom(atable); + loff_t where = _tux_create_entry(atable, name, len, atom, 0, &tux_sb(atable->i_sb)->dictsize); + if (where < 0) + return -1; // and what about the err??? + + /* Enter into reverse map - maybe verify zero refs? */ + unsigned offset; + struct buffer_head *buffer = blockread_unatom(atable, atom, &offset); + if (!buffer) + return -1; // better set a flag that unatom broke or something!!! + ((be_u64 *)bufdata(buffer))[offset] = to_be_u64(where); + brelse_dirty(buffer); + + return atom; +} + +/* Xattr cache */ + +int xcache_dump(struct inode *inode) +{ + if (!tux_inode(inode)->xcache) + return 0; + //warn("xattrs %p/%i", inode->xcache, inode->xcache->size); + struct xcache *xcache = tux_inode(inode)->xcache; + struct xattr *xattr = xcache->xattrs, *limit = xcache_limit(xcache); + while (xattr < limit) { + if (xattr->size > tux_sb(inode->i_sb)->blocksize) + goto bail; + printf("atom %.3x => ", xattr->atom); + if (xattr->size) + hexdump(xattr->body, xattr->size); + else + printf("\n"); + if ((xattr = xcache_next(xattr)) > limit) + goto fail; + } + assert(xattr == limit); + return 0; +fail: + error("corrupt xattrs"); +bail: + error("xattr too big"); + return -1; +} + +static struct xattr *xcache_lookup(struct xcache *xcache, unsigned atom) +{ + if (xcache) { + struct xattr *xattr = xcache->xattrs; + struct xattr *limit = xcache_limit(xcache); + while (xattr < limit) { + if (xattr->atom == atom) + return xattr; + if ((xattr = xcache_next(xattr)) > limit) + return ERR_PTR(-EINVAL); + } + assert(xattr == limit); + } + return ERR_PTR(-ENOATTR); +} + +struct xcache *new_xcache(unsigned maxsize) +{ + warn("realloc xcache to %i", maxsize); + struct xcache *xcache = malloc(maxsize); + if (!xcache) + return NULL; + *xcache = (struct xcache){ .size = sizeof(struct xcache), .maxsize = maxsize }; + return xcache; +} + +static inline int remove_old(struct xcache *xcache, struct xattr *xattr) +{ + if (!xattr) + return 0; + unsigned size = (void *)xcache_next(xattr) - (void *)xattr; + memmove(xattr, xcache_next(xattr), xcache->size -= size); + return 1; +} + +/* + * Things to improve about xcache_update: + * + * * It always allocates the new attribute at the end of the list because it + * is lazy and works by always deleting the attribute first then putting + * the new one at the end + * + * * If the size of the attribute did not change, does unecessary work + * + * * Should expand by binary factor + */ +static int xcache_update(struct inode *inode, unsigned atom, const void *data, unsigned len, unsigned flags) +{ + int use = 0; + struct xcache *xcache = tux_inode(inode)->xcache; + struct xattr *xattr = xcache_lookup(xcache, atom); + if (IS_ERR(xattr)) { + if (PTR_ERR(xattr) != -ENOATTR || (flags & XATTR_REPLACE)) + return PTR_ERR(xattr); + } else { + if (flags & XATTR_CREATE) + return -EEXIST; + use -= remove_old(xcache, xattr); + } + + /* Insert new */ + unsigned more = sizeof(*xattr) + len; + if (!xcache || xcache->size + more > xcache->maxsize) { + unsigned oldsize = xcache ? xcache->size : sizeof(struct xcache); + unsigned maxsize = xcache ? xcache->maxsize : (1 << 7); + unsigned newsize = oldsize + (more < maxsize ? maxsize : more); + struct xcache *newcache = new_xcache(newsize); + if (!newcache) + return -ENOMEM; + if (xcache) { + memcpy(newcache->xattrs, xcache->xattrs, oldsize - sizeof(struct xcache)); + newcache->size = oldsize; + free(xcache); + } + tux_inode(inode)->xcache = newcache; + } + xattr = xcache_limit(tux_inode(inode)->xcache); + //warn("expand by %i\n", more); + tux_inode(inode)->xcache->size += more; + memcpy(xattr->body, data, (xattr->size = len)); + xattr->atom = atom; + use++; + if (use) + use_atom(tux_sb(inode->i_sb)->atable, atom, use); + return 0; +} + +int get_xattr(struct inode *inode, const char *name, unsigned len, void *data, unsigned size) +{ + struct inode *atable = tux_sb(inode->i_sb)->atable; + int ret; + mutex_lock(&atable->i_mutex); + atom_t atom = find_atom(atable, name, len); + if (atom == -1) { + ret = -ENOATTR; + goto out; + } + struct xattr *xattr = xcache_lookup(tux_inode(inode)->xcache, atom); + ret = xattr->size; + if (ret < size) + memcpy(data, xattr->body, ret); + else if (size) + ret = -ERANGE; +out: + mutex_unlock(&atable->i_mutex); + return ret; +} + +int set_xattr(struct inode *inode, const char *name, unsigned len, const void *data, unsigned size, unsigned flags) +{ + struct inode *atable = tux_sb(inode->i_sb)->atable; + mutex_lock(&atable->i_mutex); + atom_t atom = make_atom(atable, name, len); + int err = (atom == -1) ? -EINVAL : + xcache_update(inode, atom, data, size, flags); + mutex_unlock(&atable->i_mutex); + return err; +} + +int del_xattr(struct inode *inode, const char *name, unsigned len) +{ + int err = 0; + struct inode *atable = tux_sb(inode->i_sb)->atable; + mutex_lock(&atable->i_mutex); + atom_t atom = find_atom(atable, name, len); + if (atom == -1) { + err = -ENOATTR; + goto out; + } + struct xcache *xcache = tux_inode(inode)->xcache; + struct xattr *xattr = xcache_lookup(xcache, atom); + if (IS_ERR(xattr)) { + err = PTR_ERR(xattr); + goto out; + } + int used = remove_old(xcache, xattr); + if (used) + use_atom(atable, atom, -used); +out: + mutex_unlock(&atable->i_mutex); + return err; +} + +int xattr_list(struct inode *inode, char *text, size_t size) +{ + if (!tux_inode(inode)->xcache) + return 0; + struct inode *atable = tux_sb(inode->i_sb)->atable; + mutex_lock(&atable->i_mutex); + struct xcache *xcache = tux_inode(inode)->xcache; + struct xattr *xattr = xcache->xattrs, *limit = xcache_limit(xcache); + char *base = text, *top = text + size; + while (xattr < limit) { + atom_t atom = xattr->atom; + if (size) { + int tail = top - text; + int len = unatom(atable, atom, text, tail); + if (len < 0 || len == tail) + goto full; + *(text += len) = 0; + text++; + } else + text += unatom(atable, atom, NULL, 0) + 1; + if ((xattr = xcache_next(xattr)) > limit) + goto fail; + } + assert(xattr == limit); +full: + mutex_unlock(&atable->i_mutex); + return text - base; +fail: + mutex_unlock(&atable->i_mutex); + return -EINVAL; +} + +/* Xattr encode/decode */ + +void *encode_xattrs(struct inode *inode, void *attrs, unsigned size) +{ + if (!tux_inode(inode)->xcache) + return attrs; + struct xattr *xattr = tux_inode(inode)->xcache->xattrs; + struct xattr *xtop = xcache_limit(tux_inode(inode)->xcache); + void *limit = attrs + size - 3; + while (xattr < xtop) { + if (attrs >= limit) + break; + //immediate xattr: kind+version:16, bytes:16, atom:16, data[bytes - 2] + //printf("xattr %x/%x ", xattr->atom, xattr->size); + attrs = encode_kind(attrs, XATTR_ATTR, tux_sb(inode->i_sb)->version); + attrs = encode16(attrs, xattr->size + 2); + attrs = encode16(attrs, xattr->atom); + memcpy(attrs, xattr->body, xattr->size); + attrs += xattr->size; + xattr = xcache_next(xattr); + } + return attrs; +} + +unsigned decode_xsize(struct inode *inode, void *attrs, unsigned size) +{ + struct sb *sb = tux_sb(inode->i_sb); + unsigned total = 0, bytes; + void *limit = attrs + size; + while (attrs < limit - 1) { + unsigned head, kind; + attrs = decode16(attrs, &head); + switch ((kind = head >> 12)) { + case XATTR_ATTR: + case IDATA_ATTR: + // immediate data: kind+version:16, bytes:16, data[bytes] + // immediate xattr: kind+version:16, bytes:16, atom:16, data[bytes - 2] + attrs = decode16(attrs, &bytes); + attrs += bytes; + if ((head & 0xfff) == sb->version) + total += sizeof(struct xattr) + bytes - 2; + continue; + } + attrs += atsize[kind]; + } + return total ? total + sizeof(struct xcache) : 0; +} + +unsigned encode_xsize(struct inode *inode) +{ + if (!tux_inode(inode)->xcache) + return 0; + unsigned size = 0, xatsize = atsize[XATTR_ATTR]; + struct xattr *xattr = tux_inode(inode)->xcache->xattrs; + struct xattr *limit = xcache_limit(tux_inode(inode)->xcache); + while (xattr < limit) { + size += 2 + xatsize + xattr->size; + xattr = xcache_next(xattr); + } + assert(xattr == limit); + return size; +} diff --git a/include/linux/tux3.h b/include/linux/tux3.h new file mode 100644 index 0000000..37d1dcb --- /dev/null +++ b/include/linux/tux3.h @@ -0,0 +1,4 @@ +#ifndef LINUX_TUX3_H +#define LINUX_TUX3_H + +#endif