diff --git a/.gitignore b/.gitignore index 869e1a3..bdb2067 100644 --- a/.gitignore +++ b/.gitignore @@ -64,3 +64,8 @@ ncscope.* *.orig *~ \#*# + +# uml +include/asm-um +arch/um +linux diff --git a/fs/Kconfig b/fs/Kconfig index 2694648..a428126 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -202,6 +202,14 @@ config EXT4DEV_FS_SECURITY If you are not using a security module that requires using extended attributes for file security labels, say N. +config TUX3 + tristate "Tux3 Versioning Filesystem" + help + To compile this filesystem as a module, choose M here: the module will + be called tux3. + + If unsure, say Maybe. + config JBD tristate help diff --git a/fs/Makefile b/fs/Makefile index 1e7a11b..c59a911 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -72,6 +72,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev obj-$(CONFIG_JBD) += jbd/ obj-$(CONFIG_JBD2) += jbd2/ obj-$(CONFIG_EXT2_FS) += ext2/ +obj-$(CONFIG_TUX3) += tux3/ obj-$(CONFIG_CRAMFS) += cramfs/ obj-y += ramfs/ obj-$(CONFIG_HUGETLBFS) += hugetlbfs/ diff --git a/fs/tux3/Makefile b/fs/tux3/Makefile new file mode 100644 index 0000000..5ae93dc --- /dev/null +++ b/fs/tux3/Makefile @@ -0,0 +1,13 @@ +ifeq ($(KERNELRELEASE),) +LINUX = /lib/modules/`uname -r`/build/ + +all: + make -C $(LINUX) M=`pwd` CONFIG_TUX3=m modules +clean: + make -C $(LINUX) M=`pwd` CONFIG_TUX3=m clean +else +obj-$(CONFIG_TUX3) += tux3.o +tux3-objs += balloc.o btree.o dir.o dleaf.o filemap.o hexdump.o iattr.o \ + ileaf.o inode.o super.o xattr.o +EXTRA_CFLAGS += -std=gnu99 -Wno-declaration-after-statement +endif diff --git a/fs/tux3/balloc.c b/fs/tux3/balloc.c new file mode 100644 index 0000000..98ac7c4 --- /dev/null +++ b/fs/tux3/balloc.c @@ -0,0 +1,282 @@ +/* + * Block allocation + * + * Original copyright (c) 2008 Daniel Phillips + * Portions copyright (c) 2006-2008 Google Inc. + * Licensed under the GPL version 3 + * + * By contributing changes to this file you grant the original copyright holder + * the right to distribute those changes under any license. + */ + +#include "tux3.h" + +#ifndef trace +#define trace trace_off +#endif + +void set_bits(u8 *bitmap, unsigned start, unsigned count) +{ + unsigned limit = start + count; + unsigned lmask = (-1 << (start & 7)) & 0xff; // little endian!!! + unsigned rmask = ~(-1 << (limit & 7)) & 0xff; // little endian!!! + unsigned loff = start >> 3, roff = limit >> 3; + if (loff == roff) { + bitmap[loff] |= lmask & rmask; + return; + } + bitmap[loff] |= lmask; + memset(bitmap + loff + 1, -1, roff - loff - 1); + if (rmask) + bitmap[roff] |= rmask; +} + +void clear_bits(u8 *bitmap, unsigned start, unsigned count) +{ + unsigned limit = start + count; + unsigned lmask = (-1 << (start & 7)) & 0xff; // little endian!!! + unsigned rmask = ~(-1 << (limit & 7)) & 0xff; // little endian!!! + unsigned loff = start >> 3, roff = limit >> 3; + if (loff == roff) { + bitmap[loff] &= ~lmask | ~rmask; + return; + } + bitmap[loff] &= ~lmask; + memset(bitmap + loff + 1, 0, roff - loff - 1); + if (rmask) + bitmap[roff] &= ~rmask; +} + +int all_set(u8 *bitmap, unsigned start, unsigned count) +{ + unsigned limit = start + count; + unsigned lmask = (-1 << (start & 7)) & 0xff; // little endian!!! + unsigned rmask = ~(-1 << (limit & 7)) & 0xff; // little endian!!! + unsigned loff = start >> 3, roff = limit >> 3; + if (loff == roff) { + unsigned mask = lmask & rmask; + return (bitmap[loff] & mask) == mask; + } + for (unsigned i = loff + 1; i < roff; i++) + if (bitmap[i] != 0xff) + return 0; + return (bitmap[loff] & lmask) == lmask && + (!rmask || (bitmap[roff] & rmask) == rmask); +} + +static int bytebits(unsigned char c) +{ + unsigned count = 0; + for (; c; c >>= 1) + count += c & 1; + return count; +} + +block_t count_range(struct inode *inode, block_t start, block_t count) +{ + assert(!(start & 7)); + unsigned char ones[256]; + for (int i = 0; i < sizeof(ones); i++) + ones[i] = bytebits(i); + + block_t limit = start + count; + unsigned blocksize = 1 << tux_sb(inode->i_sb)->blockbits; + unsigned mapshift = tux_sb(inode->i_sb)->blockbits + 3; + unsigned mapmask = (1 << mapshift) - 1; + unsigned blocks = (limit + mapmask) >> mapshift; + unsigned offset = (start & mapmask) >> 3; + block_t tail = (count + 7) >> 3, total = 0; + + for (unsigned block = start >> mapshift; block < blocks; block++) { + //printf("count block %x/%x\n", block, blocks); + struct buffer_head *buffer = blockread(mapping(inode), block); + if (!buffer) + return -1; + unsigned bytes = blocksize - offset; + if (bytes > tail) + bytes = tail; + unsigned char *p = bufdata(buffer) + offset, *top = p + bytes; + while (p < top) + total += ones[*p++]; + brelse(buffer); + tail -= bytes; + offset = 0; + } + return total; +} + +block_t bitmap_dump(struct inode *inode, block_t start, block_t count) +{ + block_t limit = start + count; + unsigned blocksize = 1 << tux_sb(inode->i_sb)->blockbits; + unsigned mapshift = tux_sb(inode->i_sb)->blockbits + 3; + unsigned mapmask = (1 << mapshift) - 1; + unsigned blocks = (limit + mapmask) >> mapshift, active = 0; + unsigned offset = (start & mapmask) >> 3; + unsigned startbit = start & 7; + block_t tail = (count + startbit + 7) >> 3, begin = -1; + + printf("%i bitmap blocks:\n", blocks); + for (unsigned block = start >> mapshift; block < blocks; block++) { + int ended = 0, any = 0; + struct buffer_head *buffer = blockread(mapping(inode), block); + if (!buffer) + return -1; + unsigned bytes = blocksize - offset; + if (bytes > tail) + bytes = tail; + unsigned char *p = bufdata(buffer) + offset, *top = p + bytes; + for (; p < top; p++, startbit = 0) { + unsigned c = *p; + if (!any && c) + printf("[%x] ", block); + any |= c; + if ((!c && begin < 0) || (c == 0xff && begin >= 0)) + continue; + for (int i = startbit, mask = 1 << startbit; i < 8; i++, mask <<= 1) { + if (!(c & mask) == (begin < 0)) + continue; + block_t found = i + (((void *)p - bufdata(buffer)) << 3) + (block << mapshift); + if (begin < 0) + begin = found; + else { + if ((begin >> mapshift) != block) + printf("-%Lx ", (L)(found - 1)); + else if (begin == found - 1) + printf("%Lx ", (L)begin); + else + printf("%Lx-%Lx ", (L)begin, (L)(found - 1)); + begin = -1; + ended++; + } + } + } + active += !!any; + brelse(buffer); + tail -= bytes; + offset = 0; + if (begin >= 0) + printf("%Lx-", (L)begin); + if (any) + printf("\n"); + } + printf("(%i active)\n", active); + return -1; +} + +block_t balloc_extent_from_range(struct inode *inode, block_t start, unsigned count, unsigned blocks) +{ + trace("balloc %i blocks from [%Lx/%Lx]", blocks, (L)start, (L)count); + block_t limit = start + count; + unsigned blocksize = 1 << tux_sb(inode->i_sb)->blockbits; + unsigned mapshift = tux_sb(inode->i_sb)->blockbits + 3; + unsigned mapmask = (1 << mapshift) - 1; + unsigned mapblocks = (limit + mapmask) >> mapshift; + unsigned offset = (start & mapmask) >> 3; + unsigned startbit = start & 7; + block_t tail = (count + startbit + 7) >> 3; + for (unsigned mapblock = start >> mapshift; mapblock < mapblocks; mapblock++) { + trace_off("search mapblock %x/%x", mapblock, mapblocks); + struct buffer_head *buffer = blockread(mapping(inode), mapblock); + if (!buffer) + return -1; + unsigned bytes = blocksize - offset, run = 0; + if (bytes > tail) + bytes = tail; + unsigned char *p = bufdata(buffer) + offset, *top = p + bytes, c; + for (; p < top; p++, startbit = 0) { + if ((c = *p) == 0xff) { + run = 0; + continue; + } + for (int i = startbit, mask = 1 << startbit; i < 8; i++, mask <<= 1) { + if ((c & mask)) { + run = 0; + continue; + } + if (++run < blocks) + continue; + assert(run == blocks); + block_t found = i + (((void *)p - bufdata(buffer)) << 3) + (mapblock << mapshift); + if (found >= limit) { + assert(mapblock == mapblocks - 1); + goto final_partial_byte; + } + found -= run - 1; + set_bits(bufdata(buffer), found & mapmask, run); + brelse_dirty(buffer); + tux_sb(inode->i_sb)->nextalloc = found + run; + tux_sb(inode->i_sb)->freeblocks -= run; + //set_sb_dirty(sb); + return found; + } + } +final_partial_byte: + brelse(buffer); + tail -= bytes; + offset = 0; + } + return -1; +} + +block_t balloc_from_range(struct inode *inode, block_t start, block_t count) +{ + return balloc_extent_from_range(inode, start, count, 1); +} + +block_t balloc(SB) +{ + trace_off("balloc block at goal %Lx", (L)sb->nextalloc); + block_t goal = sb->nextalloc, total = sb->volblocks, block; + if ((block = balloc_from_range(sb->bitmap, goal, total - goal)) >= 0) + goto found; + if ((block = balloc_from_range(sb->bitmap, 0, goal)) >= 0) + goto found; + return -1; +found: + trace("balloc -> [%Lx]", (L)block); + return block; +} + +block_t balloc_extent(SB, unsigned blocks) +{ + trace_off("balloc %x blocks at goal %Lx", blocks, (L)sb->nextalloc); + block_t goal = sb->nextalloc, total = sb->volblocks, block; + if ((block = balloc_extent_from_range(sb->bitmap, goal, total - goal, blocks)) >= 0) + goto found; + if ((block = balloc_extent_from_range(sb->bitmap, 0, goal, blocks)) >= 0) + goto found; + return -1; +found: + printf("balloc extent -> [%Lx/%x]\n", (L)block, blocks); + return block; +} + +void bfree_extent(SB, block_t start, unsigned count) +{ + unsigned mapshift = sb->blockbits + 3; + unsigned mapmask = (1 << mapshift) - 1; + unsigned mapblock = start >> mapshift; + char *why = "could not read bitmap buffer"; + struct buffer_head *buffer = blockread(mapping(sb->bitmap), mapblock); + printf("free <- [%Lx]\n", (L)start); + if (!buffer) + goto eek; + if (!all_set(bufdata(buffer), start &= mapmask, count)) + goto eeek; + clear_bits(bufdata(buffer), start, count); + brelse_dirty(buffer); + sb->freeblocks += count; + //set_sb_dirty(sb); + return; +eeek: + why = "blocks already free"; + brelse(buffer); +eek: + warn("extent 0x%Lx %s!\n", (L)start, why); +} + +void bfree(SB, block_t block) +{ + bfree_extent(sb, block, 1); +} diff --git a/fs/tux3/btree.c b/fs/tux3/btree.c new file mode 100644 index 0000000..b65c97b --- /dev/null +++ b/fs/tux3/btree.c @@ -0,0 +1,519 @@ +/* + * Generic btree operations + * + * Original copyright (c) 2008 Daniel Phillips + * Portions copyright (c) 2006-2008 Google Inc. + * Licensed under the GPL version 3 + * + * By contributing changes to this file you grant the original copyright holder + * the right to distribute those changes under any license. + */ + +#include "tux3.h" + +#ifndef trace +#define trace trace_off +#endif + +struct bnode +{ + be_u32 count, unused; + struct index_entry { be_u64 key; be_u64 block; } PACKED entries[]; +} PACKED; +/* + * Note that the first key of an index block is never accessed. This is + * because for a btree, there is always one more key than nodes in each + * index node. In other words, keys lie between node pointers. I + * micro-optimize by placing the node count in the first key, which allows + * a node to contain an esthetically pleasing binary number of pointers. + * (Not done yet.) + */ + +static inline unsigned bcount(struct bnode *node) +{ + return from_be_u32(node->count); +} + +static void free_block(SB, block_t block) +{ +} + +static struct buffer_head *new_block(struct btree *btree) +{ + block_t block = (btree->ops->balloc)(btree->sb); + if (block == -1) + return NULL; + struct buffer_head *buffer = sb_getblk(vfs_sb(btree->sb), block); + if (!buffer) + return NULL; + memset(bufdata(buffer), 0, bufsize(buffer)); + return buffer; +} + +static struct buffer_head *new_leaf(struct btree *btree) +{ + struct buffer_head *buffer = new_block(btree); + if (buffer) + (btree->ops->leaf_init)(btree, bufdata(buffer)); + return buffer; +} + +static struct buffer_head *new_node(struct btree *btree) +{ + struct buffer_head *buffer = new_block(btree); + if (buffer) + ((struct bnode *)bufdata(buffer))->count = 0; + return buffer; +} + +/* + * A btree path has n + 1 entries for a btree of depth n, with the first n + * entries pointing at internal nodes and entry n + 1 pointing at a leaf. + * The next field points at the next index entry that will be loaded in a left + * to right tree traversal, not the current entry. The next pointer is null + * for the leaf, which has its own specialized traversal algorithms. + */ + +static inline struct bnode *path_node(struct tux_path path[], int level) +{ + return bufdata(path[level].buffer); +} + +void release_path(struct tux_path path[], int levels) +{ + for (int i = 0; i < levels; i++) + brelse(path[i].buffer); +} + +void show_path(struct tux_path path[], int levels) +{ + printf(">>> path %p/%i:", path, levels); + for (int i = 0; i < levels; i++) + printf(" [%Lx/%i]", (L)bufindex(path[i].buffer), bufcount(path[i].buffer)); + printf("\n"); +} + +struct tux_path *alloc_path(int levels) +{ + return malloc(sizeof(struct tux_path) * levels); +} + +void free_path(struct tux_path path[]) +{ + free(path); +} + +int probe(BTREE, tuxkey_t key, struct tux_path path[]) +{ + unsigned i, levels = btree->root.depth; + struct buffer_head *buffer = sb_bread(vfs_sb(btree->sb), btree->root.block); + if (!buffer) + return -EIO; + struct bnode *node = bufdata(buffer); + + for (i = 0; i < levels; i++) { + struct index_entry *next = node->entries, *top = next + bcount(node); + while (++next < top) /* binary search goes here */ + if (from_be_u64(next->key) > key) + break; + //printf("probe level %i, %ti of %i\n", i, next - node->entries, bcount(node)); + path[i] = (struct tux_path){ buffer, next }; + if (!(buffer = sb_bread(vfs_sb(btree->sb), from_be_u64((next - 1)->block)))) + goto eek; + node = (struct bnode *)bufdata(buffer); + } + assert((btree->ops->leaf_sniff)(btree, bufdata(buffer))); + path[levels] = (struct tux_path){ buffer }; + return 0; +eek: + release_path(path, i - 1); + return -EIO; /* stupid, it might have been NOMEM */ +} + +static inline int level_finished(struct tux_path path[], int level) +{ + struct bnode *node = path_node(path, level); + return path[level].next == node->entries + bcount(node); +} +// also write level_beginning!!! + +int advance(BTREE, struct tux_path path[]) +{ + int levels = btree->root.depth, level = levels; + struct buffer_head *buffer = path[level].buffer; + struct bnode *node; + do { + brelse(buffer); + if (!level) + return 0; + node = bufdata(buffer = path[--level].buffer); + //printf("pop to level %i, %tx of %x\n", level, path[level].next - node->entries, bcount(node)); + } while (level_finished(path, level)); + do { + //printf("push from level %i, %tx of %x\n", level, path[level].next - node->entries, bcount(node)); + if (!(buffer = sb_bread(vfs_sb(btree->sb), from_be_u64(path[level].next++->block)))) + goto eek; + path[++level] = (struct tux_path){ .buffer = buffer, .next = (node = bufdata(buffer))->entries }; + } while (level < levels); + return 1; +eek: + release_path(path, level); + return -EIO; +} + +/* + * Climb up the path until we find the first level where we have not yet read + * all the way to the end of the index block, there we find the key that + * separates the subtree we are in (a leaf) from the next subtree to the right. + */ +be_u64 *next_keyp(struct tux_path path[], int levels) +{ + for (int level = levels; level--;) + if (!level_finished(path, level)) + return &path[level].next->key; + return NULL; +} + +tuxkey_t next_key(struct tux_path path[], int levels) +{ + be_u64 *keyp = next_keyp(path, levels); + return keyp ? from_be_u64(*keyp) : -1; +} +// also write this_key!!! + +void show_tree_range(BTREE, tuxkey_t start, unsigned count) +{ + printf("%i level btree at %Li:\n", btree->root.depth, (L)btree->root.block); + struct tux_path path[30]; // check for overflow!!! + if (probe(btree, start, path)) + error("tell me why!!!"); + struct buffer_head *buffer; + do { + buffer = path[btree->root.depth].buffer; + assert((btree->ops->leaf_sniff)(btree, bufdata(buffer))); + (btree->ops->leaf_dump)(btree, bufdata(buffer)); + //tuxkey_t *next = pnext_key(path, btree->levels); + //printf("next key = %Lx:\n", next ? (L)*next : 0); + } while (--count && advance(btree, path)); +} + +/* Deletion */ + +static void brelse_free(SB, struct buffer_head *buffer) +{ + brelse(buffer); + if (bufcount(buffer)) { + warn("free block %Lx still in use!", (long long)bufindex(buffer)); + return; + } + free_block(sb, bufindex(buffer)); + set_buffer_empty(buffer); // free it!!! (and need a buffer free state) +} + +static void remove_index(struct tux_path path[], int level) +{ + struct bnode *node = path_node(path, level); + int count = bcount(node), i; + + /* stomps the node count (if 0th key holds count) */ + memmove(path[level].next - 1, path[level].next, + (char *)&node->entries[count] - (char *)path[level].next); + node->count = to_be_u32(count - 1); + --(path[level].next); + mark_buffer_dirty(path[level].buffer); + + /* no separator for last entry */ + if (level_finished(path, level)) + return; + /* + * Climb up to common parent and set separating key to deleted key. + * What if index is now empty? (no deleted key) + * Then some key above is going to be deleted and used to set sep + * Climb the path while at first entry, bail out at root + * find the node with the old sep, set it to deleted key + */ + if (path[level].next == node->entries && level) { + be_u64 sep = (path[level].next)->key; + for (i = level - 1; path[i].next - 1 == path_node(path, i)->entries; i--) + if (!i) + return; + (path[i].next - 1)->key = sep; + mark_buffer_dirty(path[i].buffer); + } +} + +static void merge_nodes(struct bnode *node, struct bnode *node2) +{ + memcpy(&node->entries[bcount(node)], &node2->entries[0], bcount(node2) * sizeof(struct index_entry)); + node->count = to_be_u32(bcount(node) + bcount(node2)); +} + +int delete_from_leaf(BTREE, vleaf *leaf, struct delete_info *info) +{ + (btree->ops->leaf_chop)(btree, info->key, leaf); + return 0; +} + +int tree_chop(BTREE, struct delete_info *info, millisecond_t deadline) +{ + int levels = btree->root.depth, level = levels - 1, suspend = 0; + struct tux_path *path, *prev; + struct buffer_head *leafbuf, *leafprev = NULL; + struct btree_ops *ops = btree->ops; + struct sb *sb = btree->sb; + + path = alloc_path(levels + 1); + prev = alloc_path(levels + 1); + memset(prev, 0, sizeof(struct tux_path) * (levels + 1)); + + probe(btree, info->resume, path); + leafbuf = path[levels].buffer; + + /* leaf walk */ + while (1) { + if (delete_from_leaf(btree, bufdata(leafbuf), info)) + mark_buffer_dirty(leafbuf); + + /* try to merge this leaf with prev */ + if (leafprev) { + struct vleaf *this = bufdata(leafbuf); + struct vleaf *that = bufdata(leafprev); + trace_off("check leaf %p against %p", leafbuf, leafprev); + trace_off("need = %i, free = %i", (ops->dleaf_need)(btree, this), dleaf_free(sb, that)); + /* try to merge leaf with prev */ + if ((ops->leaf_need)(btree, this) <= (ops->leaf_free)(btree, that)) { + trace(">>> can merge leaf %p into leaf %p", leafbuf, leafprev); + (ops->leaf_merge)(btree, that, this); + remove_index(path, level); + mark_buffer_dirty(leafprev); + brelse_free(sb, leafbuf); + //dirty_buffer_count_check(sb); + goto keep_prev_leaf; + } + brelse(leafprev); + } + leafprev = leafbuf; +keep_prev_leaf: + + //nanosleep(&(struct timespec){ 0, 50 * 1000000 }, NULL); + //printf("time remaining: %Lx\n", deadline - gettime()); +// if (deadline && gettime() > deadline) +// suspend = -1; + if (info->blocks && info->freed >= info->blocks) + suspend = -1; + + /* pop and try to merge finished nodes */ + while (suspend || level_finished(path, level)) { + /* try to merge node with prev */ + if (prev[level].buffer) { + assert(level); /* node has no prev */ + struct bnode *this = path_node(path, level); + struct bnode *that = path_node(prev, level); + trace_off("check node %p against %p", this, that); + trace_off("this count = %i prev count = %i", bcount(this), bcount(that)); + /* try to merge with node to left */ + if (bcount(this) <= sb->entries_per_node - bcount(that)) { + trace(">>> can merge node %p into node %p", this, that); + merge_nodes(that, this); + remove_index(path, level - 1); + mark_buffer_dirty(prev[level].buffer); + brelse_free(sb, path[level].buffer); + //dirty_buffer_count_check(sb); + goto keep_prev_node; + } + brelse(prev[level].buffer); + } + prev[level].buffer = path[level].buffer; +keep_prev_node: + + /* deepest key in the path is the resume address */ + if (suspend == -1 && !level_finished(path, level)) { + suspend = 1; /* only set resume once */ + info->resume = from_be_u64((path[level].next)->key); + } + if (!level) { /* remove levels if possible */ + while (levels > 1 && bcount(path_node(prev, 0)) == 1) { + trace("drop btree level"); + btree->root.block = bufindex(prev[1].buffer); + brelse_free(sb, prev[0].buffer); + //dirty_buffer_count_check(sb); + levels = --btree->root.depth; + memcpy(prev, prev + 1, levels * sizeof(prev[0])); + //set_sb_dirty(sb); + } + brelse(leafprev); + release_path(prev, levels); + free_path(path); + free_path(prev); + //sb->snapmask &= ~snapmask; delete_snapshot_from_disk(); + //set_sb_dirty(sb); + //save_sb(sb); + return suspend; + } + level--; + trace_off(printf("pop to level %i, block %Lx, %i of %i nodes\n", level, path[level].buffer->index, path[level].next - path_node(path, level)->entries, bcount(path_node(path, level)));); + } + + /* push back down to leaf level */ + while (level < levels - 1) { + struct buffer_head *buffer = sb_bread(vfs_sb(sb), from_be_u64(path[level++].next++->block)); + if (!buffer) { + brelse(leafprev); + release_path(path, level - 1); + free_path(path); + free_path(prev); + return -ENOMEM; + } + path[level].buffer = buffer; + path[level].next = ((struct bnode *)bufdata(buffer))->entries; + trace_off(printf("push to level %i, block %Lx, %i nodes\n", level, bufindex(buffer), bcount(path_node(path, level)));); + }; + //dirty_buffer_count_check(sb); + /* go to next leaf */ + if (!(leafbuf = sb_bread(vfs_sb(sb), from_be_u64(path[level].next++->block)))) { + release_path(path, level); + free_path(path); + free_path(prev); + return -ENOMEM; + } + } +} + +/* Insertion */ + +static void add_child(struct bnode *node, struct index_entry *p, block_t child, u64 childkey) +{ + vecmove(p + 1, p, node->entries + bcount(node) - p); + p->block = to_be_u64(child); + p->key = to_be_u64(childkey); + node->count = to_be_u32(bcount(node) + 1); +} + +int insert_node(struct btree *btree, u64 childkey, block_t childblock, struct tux_path path[]) +{ + trace("insert node 0x%Lx key 0x%Lx into node 0x%Lx", (L)childblock, (L)childkey, (L)btree->root.block); + int levels = btree->root.depth; + while (levels--) { + struct index_entry *next = path[levels].next; + struct buffer_head *parentbuf = path[levels].buffer; + struct bnode *parent = bufdata(parentbuf); + + /* insert and exit if not full */ + if (bcount(parent) < btree->sb->entries_per_node) { + add_child(parent, next, childblock, childkey); + mark_buffer_dirty(parentbuf); + return 0; + } + + /* split a full index node */ + struct buffer_head *newbuf = new_node(btree); + if (!newbuf) + goto eek; + struct bnode *newnode = bufdata(newbuf); + unsigned half = bcount(parent) / 2; + u64 newkey = from_be_u64(parent->entries[half].key); + newnode->count = to_be_u32(bcount(parent) - half); + memcpy(&newnode->entries[0], &parent->entries[half], bcount(newnode) * sizeof(struct index_entry)); + parent->count = to_be_u32(half); + + /* if the path is in the new node, use that as the parent */ + if (next > parent->entries + half) { + next = next - &parent->entries[half] + newnode->entries; + mark_buffer_dirty(parentbuf); + parentbuf = newbuf; + parent = newnode; + } else + mark_buffer_dirty(newbuf); + add_child(parent, next, childblock, childkey); + mark_buffer_dirty(parentbuf); + childkey = newkey; + childblock = bufindex(newbuf); + brelse(newbuf); + } + trace("add tree level"); + struct buffer_head *newbuf = new_node(btree); + if (!newbuf) + goto eek; + struct bnode *newroot = bufdata(newbuf); + newroot->count = to_be_u32(2); + newroot->entries[0].block = to_be_u64(btree->root.block); + newroot->entries[1].key = to_be_u64(childkey); + newroot->entries[1].block = to_be_u64(childblock); + btree->root.block = bufindex(newbuf); + vecmove(path + 1, path, btree->root.depth++ + 1); + path[0] = (struct tux_path){ .buffer = newbuf }; // .next = ??? + //set_sb_dirty(sb); + mark_buffer_dirty(newbuf); + return 0; +eek: + release_path(path, levels + 1); + return -ENOMEM; +} + +int btree_leaf_split(struct btree *btree, struct tux_path path[], tuxkey_t key) +{ + trace("split leaf"); + struct buffer_head *leafbuf = path[btree->root.depth].buffer; + struct buffer_head *newbuf = new_leaf(btree); + if (!newbuf) { + /* the rule: release path at point of error */ + release_path(path, btree->root.depth); + return -ENOMEM; + } + u64 newkey = (btree->ops->leaf_split)(btree, key, bufdata(leafbuf), bufdata(newbuf)); + block_t childblock = bufindex(newbuf); + trace_off("use upper? %Li %Li", key, newkey); + if (key >= newkey) { + struct buffer_head *swap = leafbuf; + leafbuf = path[btree->root.depth].buffer = newbuf; + newbuf = swap; + } + brelse_dirty(newbuf); + return insert_node(btree, newkey, childblock, path); +} + +void *tree_expand(struct btree *btree, tuxkey_t key, unsigned newsize, struct tux_path path[]) +{ + for (int i = 0; i < 2; i++) { + struct buffer_head *leafbuf = path[btree->root.depth].buffer; + void *space = (btree->ops->leaf_resize)(btree, key, bufdata(leafbuf), newsize); + if (space) + return space; + assert(!i); + int err = btree_leaf_split(btree, path, key); + if (err) { + warn("insert_node failed (%d)", err); + break; + } + } + return NULL; +} + +struct btree new_btree(SB, struct btree_ops *ops) +{ + struct btree btree = { .sb = sb, .ops = ops }; + struct buffer_head *rootbuf = new_node(&btree); + struct buffer_head *leafbuf = new_leaf(&btree); + if (!rootbuf || !leafbuf) + goto eek; + struct bnode *root = bufdata(rootbuf); + root->entries[0].block = to_be_u64(bufindex(leafbuf)); + root->count = to_be_u32(1); + btree.root = (struct root){ .block = bufindex(rootbuf), .depth = 1 }; + printf("root at %Lx\n", (L)bufindex(rootbuf)); + printf("leaf at %Lx\n", (L)bufindex(leafbuf)); + brelse_dirty(rootbuf); + brelse_dirty(leafbuf); + return btree; +eek: + if (rootbuf) + brelse(rootbuf); + if (leafbuf) + brelse(leafbuf); + return (struct btree){ }; +} + +void free_btree(struct btree *btree) +{ + // write me +} diff --git a/fs/tux3/dir.c b/fs/tux3/dir.c new file mode 100644 index 0000000..9191d8a --- /dev/null +++ b/fs/tux3/dir.c @@ -0,0 +1,321 @@ +/* Lifted from Ext2, blush. GPLv2. Portions (c) Daniel Phillips 2008 */ + +/* + * linux/include/linux/ext2_fs.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * linux/fs/ext2/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * All code that works with directory layout had been switched to pagecache + * and moved here. AV + */ + +#include "tux3.h" + +#define TUX_DIR_PAD 3 +#define TUX_REC_LEN(name_len) (((name_len) + 8 + TUX_DIR_PAD) & ~TUX_DIR_PAD) +#define TUX_MAX_REC_LEN ((1<<16)-1) + +static inline unsigned tux_rec_len_from_disk(be_u16 dlen) +{ + unsigned len = from_be_u16(dlen); + if (len == TUX_MAX_REC_LEN) + return 1 << 16; + return len; +} + +static inline be_u16 tux_rec_len_to_disk(unsigned len) +{ + if (len == (1 << 16)) + return to_be_u16(TUX_MAX_REC_LEN); + else if (len > (1 << 16)) + error("oops"); + return to_be_u16(len); +} + +static inline int is_deleted(tux_dirent *entry) +{ + return !entry->name_len; /* ext2 uses !inum for this */ +} + +static inline int tux_match(tux_dirent *entry, const char *const name, int len) +{ + if (len != entry->name_len) + return 0; + if (is_deleted(entry)) + return 0; + return !memcmp(name, entry->name, len); +} + +static inline tux_dirent *next_entry(tux_dirent *entry) +{ + return (tux_dirent *)((char *)entry + tux_rec_len_from_disk(entry->rec_len)); +} + +enum { + TUX_UNKNOWN, + TUX_REG, + TUX_DIR, + TUX_CHR, + TUX_BLK, + TUX_FIFO, + TUX_SOCK, + TUX_LNK, + TUX_TYPES +}; + +#define STAT_SHIFT 12 + +static unsigned char tux_type_by_mode[S_IFMT >> STAT_SHIFT] = { + [S_IFREG >> STAT_SHIFT] = TUX_REG, + [S_IFDIR >> STAT_SHIFT] = TUX_DIR, + [S_IFCHR >> STAT_SHIFT] = TUX_CHR, + [S_IFBLK >> STAT_SHIFT] = TUX_BLK, + [S_IFIFO >> STAT_SHIFT] = TUX_FIFO, + [S_IFSOCK >> STAT_SHIFT] = TUX_SOCK, + [S_IFLNK >> STAT_SHIFT] = TUX_LNK, +}; + +loff_t tux_create_entry(struct inode *dir, const char *name, int len, unsigned inum, unsigned mode) +{ + tux_dirent *entry; + struct buffer_head *buffer; + unsigned reclen = TUX_REC_LEN(len), rec_len, name_len, offset; + unsigned blockbits = tux_sb(dir->i_sb)->blockbits, blocksize = 1 << blockbits; + unsigned blocks = dir->i_size >> blockbits, block; + for (block = 0; block < blocks; block++) { + buffer = blockget(mapping(dir), block); + entry = bufdata(buffer); + tux_dirent *limit = bufdata(buffer) + blocksize - reclen; + while (entry <= limit) { + if (entry->rec_len == 0) { + warn("zero-length directory entry"); + brelse(buffer); + return -1; + } + name_len = TUX_REC_LEN(entry->name_len); + rec_len = tux_rec_len_from_disk(entry->rec_len); + if (is_deleted(entry) && rec_len >= reclen) + goto create; + if (rec_len >= name_len + reclen) + goto create; + entry = (tux_dirent *)((char *)entry + rec_len); + } + brelse(buffer); + } + buffer = blockget(mapping(dir), block = blocks); + entry = bufdata(buffer); + name_len = 0; + rec_len = blocksize; + *entry = (tux_dirent){ .rec_len = tux_rec_len_to_disk(blocksize) }; + dir->i_size += blocksize; +create: + if (!is_deleted(entry)) { + tux_dirent *newent = (tux_dirent *)((char *)entry + name_len); + newent->rec_len = tux_rec_len_to_disk(rec_len - name_len); + entry->rec_len = tux_rec_len_to_disk(name_len); + entry = newent; + } + entry->name_len = len; + memcpy(entry->name, name, len); + entry->inum = to_be_u32(inum); + entry->type = tux_type_by_mode[(mode & S_IFMT) >> STAT_SHIFT]; + dir->i_mtime = dir->i_ctime = gettime(); + mark_inode_dirty(dir); + offset = (void *)entry - bufdata(buffer); + brelse_dirty(buffer); + return (block << blockbits) + offset; +} + +tux_dirent *tux_find_entry(struct inode *dir, const char *name, int len, struct buffer_head **result) +{ + unsigned reclen = TUX_REC_LEN(len); + unsigned blocksize = 1 << tux_sb(dir->i_sb)->blockbits; + unsigned blocks = dir->i_size >> tux_sb(dir->i_sb)->blockbits, block; + for (block = 0; block < blocks; block++) { + struct buffer_head *buffer = blockread(mapping(dir), block); + tux_dirent *entry = bufdata(buffer); + tux_dirent *limit = (void *)entry + blocksize - reclen; + while (entry <= limit) { + if (entry->rec_len == 0) { + brelse(buffer); + warn("zero length entry at <%Lx:%x>", (L)tux_inode(dir)->inum, block); + return NULL; + } + if (tux_match(entry, name, len)) { + *result = buffer; + return entry; + } + entry = next_entry(entry); + } + brelse(buffer); + } + *result = NULL; + return NULL; +} + +static unsigned char filetype[TUX_TYPES] = { + [TUX_UNKNOWN] = DT_UNKNOWN, + [TUX_REG] = DT_REG, + [TUX_DIR] = DT_DIR, + [TUX_CHR] = DT_CHR, + [TUX_BLK] = DT_BLK, + [TUX_FIFO] = DT_FIFO, + [TUX_SOCK] = DT_SOCK, + [TUX_LNK] = DT_LNK, +}; + +static int tux_readdir(struct file *file, void *state, filldir_t filldir) +{ + loff_t pos = file->f_pos; +#ifdef __KERNEL__ + struct inode *dir = file->f_dentry->d_inode; +#else + struct inode *dir = file->f_inode; +#endif + int revalidate = file->f_version != dir->i_version; + unsigned blockbits = tux_sb(dir->i_sb)->blockbits; + unsigned blocksize = 1 << blockbits; + unsigned blockmask = blocksize - 1; + unsigned blocks = dir->i_size >> blockbits; + unsigned offset = pos & blockmask; + for (unsigned block = pos >> blockbits ; block < blocks; block++) { + struct buffer_head *buffer = blockread(mapping(dir), block); + void *base = bufdata(buffer); + if (!buffer) + return -EIO; + if (revalidate) { + if (offset) { + tux_dirent *entry = base + offset; + tux_dirent *p = base + (offset & blockmask); + while (p < entry && p->rec_len) + p = next_entry(p); + offset = (void *)p - base; + file->f_pos = (block << blockbits) + offset; + } + file->f_version = dir->i_version; + revalidate = 0; + } + unsigned size = dir->i_size - (block << blockbits); + tux_dirent *limit = base + (size > blocksize ? blocksize : size) - TUX_REC_LEN(1); + for (tux_dirent *entry = base + offset; entry <= limit; entry = next_entry(entry)) { + if (entry->rec_len == 0) { + brelse(buffer); + warn("zero length entry at <%Lx:%x>", (L)tux_inode(dir)->inum, block); + return -EIO; + } + if (!is_deleted(entry)) { + unsigned type = (entry->type < TUX_TYPES) ? filetype[entry->type] : DT_UNKNOWN; + int lame = filldir( + state, entry->name, entry->name_len, + (block << blockbits) | ((void *)entry - base), + from_be_u32(entry->inum), type); + if (lame) { + brelse(buffer); + return 0; + } + } + file->f_pos += tux_rec_len_from_disk(entry->rec_len); + } + brelse(buffer); + offset = 0; + } + return 0; +} + +int tux_delete_entry(struct buffer_head *buffer, tux_dirent *entry) +{ + tux_dirent *prev = NULL, *this = bufdata(buffer); + while ((char *)this < (char *)entry) { + if (this->rec_len == 0) { + warn("zero-length directory entry"); + brelse(buffer); + return -EIO; + } + prev = this; + this = next_entry(this); + } + if (prev) + prev->rec_len = tux_rec_len_to_disk((void *)entry + + tux_rec_len_from_disk(entry->rec_len) - (void *)prev); + memset(entry->name, 0, entry->name_len); + entry->name_len = entry->type = 0; + entry->inum = to_be_u32(0); + brelse(buffer); + return 0; +} + +#ifdef __KERNEL__ +static struct dentry *tux_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct buffer_head *buffer; + struct inode *inode = NULL; + tux_dirent *dirent; + + dirent = tux_find_entry(dir, dentry->d_name.name, dentry->d_name.len, + &buffer); + if (dirent) { + inode = tux3_iget(dir->i_sb, from_be_u32(dirent->inum)); + brelse(buffer); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + return d_splice_alias(inode, dentry); +} + +const struct file_operations tux_dir_fops = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = tux_readdir, +}; + +const struct inode_operations tux_dir_iops = { +// .create = ext3_create, + .lookup = tux_lookup, +// .link = ext3_link, +// .unlink = ext3_unlink, +// .symlink = ext3_symlink, +// .mkdir = ext3_mkdir, +// .rmdir = ext3_rmdir, +// .mknod = ext3_mknod, +// .rename = ext3_rename, +// .setattr = ext3_setattr, +// .setxattr = generic_setxattr, +// .getxattr = generic_getxattr, +// .listxattr = ext3_listxattr, +// .removexattr = generic_removexattr, +// .permission = ext3_permission, + /* FIXME: why doesn't ext4 support this for directory? */ +// .fallocate = ext4_fallocate, +// .fiemap = ext4_fiemap, +}; +#endif /* !__KERNEL__ */ diff --git a/fs/tux3/dleaf.c b/fs/tux3/dleaf.c new file mode 100644 index 0000000..b8cec81 --- /dev/null +++ b/fs/tux3/dleaf.c @@ -0,0 +1,548 @@ +/* + * File index btree leaf operations + * + * Original copyright (c) 2008 Daniel Phillips + * Licensed under the GPL version 3 + * + * By contributing changes to this file you grant the original copyright holder + * the right to distribute those changes under any license. + */ + +#include "tux3.h" + +#ifndef trace +#define trace trace_on +#endif + +/* + * Leaf index format + * + * A leaf has a small header followed by a table of extents. A two level + * index grows down from the top of the leaf towards the top of the extent + * table. The index maps each unique logical address in the leaf to one or + * more extents beginning at that address. + * + * The top level index is a table of groups of entries all having the same + * high 24 bits of logical address which is only stored once, along with the + * 8 bit count of entries in the group. Since there can be more than 256 + * entries at the same logical address, there could be more than one group + * with the same logical address. The group count is used both to know the + * number of entries in the group and to find the beginning of the entry table + * for a given group, by adding up the sizes of the proceeding groups. + * + * The 8 bit entry limit limits the number of different versions at the same + * logical address to 255. For now. + * + * The second level entry tables are stored end to end in reverse immediately + * below the groups table, also stored in reverse. Each entry has the low 24 + * bits of the logical address and the 8 bit 'limit' offset of the last extent + * for that logical address, measuring from the first extent for the group in + * units of extent size. The limit is used rather than an offset so that the + * final offset is the count of extents in the group, which is summed up to + * locate the first extent for the group in the extent table. The difference + * between and entry limit and the limit of its predecessor gives the count of + * extents for the logical address specified by the entry. + * + * At the top level of a very large or very sparse btree it is likely that the + * group table will be relatively larger, up to the same size as all the entry + * tables. This does not matter much in terms of overall btree bulk. A few + * levels down the logical address space will have been split to the point + * where most entries in a leaf fit into one entry table. + * + * This leaf indexing scheme has some obscure boundary conditions, such as + * the zeroth entry of a group having no predecessor and thus needing to have + * a special check to supply zero as the preceding limit. Inserting and + * deleting are fairly involved and subtle. But the space required to index + * extents in a deep btree is reduced considerably, which is compelling. In + * the end, the indexing scheme provides access to a simple linear table of + * extents and a count, so there is little impact on the specialized methods + * that operate on those extents due to the complexity of the indexing scheme. + * The lookup operation on this index is very efficient. Each level of the + * index is suited to binary search. A sequence of inserts in ascending order + * in the same group requires no existing entries to be relocated, the reason + * the entry list is stored in reverse. + */ + +static inline struct dleaf *to_dleaf(vleaf *leaf) +{ + return leaf; +} + +int dleaf_init(BTREE, vleaf *leaf) +{ + if (!leaf) + return -1; + *to_dleaf(leaf) = (struct dleaf){ + .magic = to_be_u16(0x1eaf), + .free = to_be_u16(sizeof(struct dleaf)), + .used = to_be_u16(btree->sb->blocksize) }; + return 0; +} + +int dleaf_sniff(BTREE, vleaf *leaf) +{ + return from_be_u16(to_dleaf(leaf)->magic) == 0x1eaf; +} + +unsigned dleaf_free(BTREE, vleaf *leaf) +{ + return from_be_u16(to_dleaf(leaf)->used) - from_be_u16(to_dleaf(leaf)->free); +} + +unsigned dleaf_need(BTREE, struct dleaf *leaf) +{ + return btree->sb->blocksize - dleaf_free(btree, leaf) - sizeof(struct dleaf); +} + +int dleaf_free2(BTREE, void *vleaf) +{ + struct dleaf *leaf = vleaf; + struct group *gdict = (void *)leaf + btree->sb->blocksize, *gstop = gdict - dleaf_groups(leaf); + struct entry *edict = (void *)gstop, *entry = edict; + struct extent *extents = leaf->table; + for (struct group *group = gdict; group-- > gstop;) + extents += entry_limit(entry -= group_count(group)); + return (void *)entry - (void *)extents; +} + +void dleaf_dump(BTREE, vleaf *vleaf) +{ + unsigned blocksize = btree->sb->blocksize; + struct dleaf *leaf = vleaf; + struct group *groups = (void *)leaf + blocksize, *grbase = --groups - dleaf_groups(leaf); + struct entry *entries = (void *)(grbase + 1), *entry = entries; + struct extent *extents = leaf->table; + + printf("%i entry groups:\n", dleaf_groups(leaf)); + for (struct group *group = groups; group > grbase; group--) { + printf(" %ti/%i:", groups - group, group_count(group)); + //printf(" [%i]", extents - leaf->table); + struct entry *enbase = entry - group_count(group); + while (entry > enbase) { + --entry; + unsigned offset = entry == entries - 1 ? 0 : entry_limit(entry + 1); + int count = entry_limit(entry) - offset; + printf(" %Lx =>", ((L)group_keyhi(group) << 24) + entry_keylo(entry)); + //printf(" %p (%i)", entry, entry_limit(entry)); + if (count < 0) + printf(" "); + else for (int i = 0; i < count; i++) { + struct extent extent = extents[offset + i]; + printf(" %Lx", (L)extent_block(extent)); + if (extent_count(extent)) + printf("/%x", extent_count(extent)); + } + //printf(" {%u}", entry_limit(entry)); + printf(";"); + } + printf("\n"); + entries -= group_count(group); + extents += entry_limit(entry); + } +} + +/* + * Reasons this dleaf truncator sucks: + * + * * Does not check for integrity at all so a corrupted leaf can cause overflow + * and system corruption. + * + * * Assumes all block pointers after the truncation point will be deleted, + * which does not hold when versions arrive. + * + * * Modifies a group count in the middle of the traversal knowing that it has + * already loaded the changed field and will not load it again, fragile. + * + * * Does not provide a generic mechanism that can be adapted to other + * truncation tasks. + * + * But it does truncate so it is getting checked in just for now. + */ + +int dleaf_chop(BTREE, tuxkey_t chop, vleaf *vleaf) +{ + struct dleaf *leaf = vleaf; + struct group *gdict = (void *)leaf + btree->sb->blocksize, *group = gdict; + struct entry *entry = (void *)(--group - dleaf_groups(leaf)); + struct group *gstop = group - dleaf_groups(leaf); + struct entry *estop = entry - group_count(group); + unsigned extents = 0, start = 0, trunc = 0; + unsigned newgroups = dleaf_groups(leaf); + + if (!newgroups) + return 0; + + while (1) { + unsigned count = entry_limit(entry) - start; + tuxkey_t key = ((tuxkey_t)group_keyhi(group) << 24) | entry_keylo(entry); + if (key >= chop) { + if (!trunc) { + int removed = entry - estop, remaining = group_count(group) - removed; + newgroups = gdict - group - !remaining; + inc_group_count(group, - removed); + trunc = 1; + } + for (int i = 0; i < count; i++) + (btree->ops->bfree)(btree->sb, extent_block(leaf->table[extents + i])); + } + start = entry_limit(entry); + extents += count; + if (--entry != estop) + continue; + if (--group == gstop) + break; + estop = entry - group_count(group); + start = 0; + } + unsigned tamp = (dleaf_groups(leaf) - newgroups) * sizeof(struct group); + unsigned tail = (void *)(gdict - newgroups) - ((void *)entry + tamp); + memmove((void *)entry + tamp, entry, tail); + set_dleaf_groups(leaf, newgroups); + return 0; +} + +int dwalk_probe(struct dleaf *leaf, unsigned blocksize, struct dwalk *walk, tuxkey_t key) +{ + trace("probe for 0x%Lx", (L)key); + unsigned keylo = key & 0xffffff, keyhi = key >> 24; + struct group *gdict = (void *)leaf + blocksize; + struct entry *edict = (struct entry *)(gdict - dleaf_groups(leaf)); + struct group *gstop = gdict - dleaf_groups(leaf), *group = gdict; + struct entry *estop = edict, *entry; + struct extent *exbase = leaf->table; + + if (dleaf_groups(leaf)) + while (--group >= gstop) { + trace_off("group %i check %x = %x", gdict - group - 1, keyhi, group_keyhi(group)); + estop -= group_count(group); + if (group_keyhi(group) > keyhi) + break; + trace_off("next group keylow = %x", entry_keylo(estop - 1)); + if (group_keyhi(group) == keyhi) { + if (group == gstop) + break; + if (group_keyhi(group - 1) != keyhi) + break; + if (entry_keylo(estop - 1) > keylo) + break; + } + exbase += entry_limit(estop); + } + + struct extent *extent = exbase, *exstop = exbase; + //trace("group %i entry %i of %i", gdict - 1 - group, estop + group_count(group) - 1 - entry, group_count(group)); + if (!dleaf_groups(leaf) || group < gstop) + entry = estop; + else { + assert(group_keyhi(group) >= keyhi); + entry = estop + group_count(group); + //trace("entry %x, estop %x", entry_keylo(entry), entry_keylo(estop)); + if (group_keyhi(group) == keyhi) { + while (entry > estop) { + --entry; + trace_off("entry check %x, %x", keylo, entry_keylo(entry - 1)); + exstop = exbase + entry_limit(entry); + if (entry_keylo(entry) >= keylo) + break; + extent = exstop; + } + } + } + + trace_off("group %i entry %i of %i", gdict - 1 - group, estop + group_count(group) - 1 - entry, group_count(group)); + trace("extent = %tx, exstop = %tx", extent - leaf->table, exstop - leaf->table); + *walk = (struct dwalk){ + .leaf = leaf, .gdict = gdict, + .group = group, .gstop = gstop, + .entry = entry, .estop = estop, + .extent = extent, .exstop = exstop, + .exbase = exbase }; + return 0; +} + +tuxkey_t dwalk_index(struct dwalk *walk) +{ + return (group_keyhi(walk->group) << 24) | entry_keylo(walk->entry); +} + +struct extent *dwalk_next(struct dwalk *walk) +{ + if (!dleaf_groups(walk->leaf)) + return NULL; + trace("walk extent = %tx, exstop = %tx", walk->extent - walk->leaf->table, walk->exstop - walk->leaf->table); + if (walk->extent >= walk->exstop) { + trace("at entry %ti/%i", walk->estop + group_count(walk->group) - 1 - walk->entry, group_count(walk->group)); + if (walk->entry <= walk->estop) { + trace("next group, end = %i", walk->group <= walk->gstop); + if (walk->group <= walk->gstop) + return NULL; + walk->exbase += entry_limit(walk->estop); + trace("exbase => %Lx", (L)extent_block(*walk->exbase)); + trace("extent => %Lx", (L)extent_block(*walk->extent)); + walk->estop -= group_count(--walk->group); + } + walk->entry--; + walk->exstop = walk->exbase + entry_limit(walk->entry); + } + trace("next extent 0x%Lx => %Lx/%x", (L)dwalk_index(walk), (L)extent_block(*walk->extent), extent_count(*walk->extent)); + trace("walk extent = %tx, exstop = %tx", walk->extent - walk->leaf->table, walk->exstop - walk->leaf->table); + trace("at entry %ti/%i", walk->estop + group_count(walk->group) - 1 - walk->entry, group_count(walk->group)); + return walk->extent++; // also return key +} + +void dwalk_back(struct dwalk *walk) +{ + trace("back one entry"); + if (++walk->entry == walk->estop + group_count(walk->group)) { + trace("back one group"); + if (++walk->group == walk->gdict) { + trace("at start"); + --walk->group; + walk->exstop = walk->extent = walk->exbase = walk->leaf->table; + return; + } + walk->exbase -= entry_limit(walk->entry); + walk->estop = walk->entry; + trace("exbase => %Lx", (L)extent_block(*walk->exbase)); + trace("entry offset = %ti", walk->estop + group_count(walk->group) - 1 - walk->entry); + } + walk->extent = walk->exbase + (walk->estop + group_count(walk->group) - 1 - walk->entry); + walk->exstop = walk->exbase + entry_limit(walk->entry); + trace("exstop => %Lx", (L)extent_block(*walk->exstop)); +} + +void dwalk_chop_after(struct dwalk *walk) +{ + struct dleaf *leaf = walk->leaf; + struct group *gdict = walk->gdict; + struct entry *ebase = walk->estop + group_count(walk->group); + struct entry *entry = walk->entry; + unsigned newgroups = walk->gdict - walk->group; + set_group_count(walk->group, ebase - entry); + trace_on("%i groups, %i entries in last", dleaf_groups(leaf), group_count(walk->group)); + void *free = (void *)entry + (dleaf_groups(leaf) - newgroups) * sizeof(*gdict); + memmove(free, entry, (void *)(gdict - newgroups) - free); + walk->estop = walk->entry = free; + walk->gstop = walk->group; + set_dleaf_groups(leaf, newgroups); +} + +void dwalk_chop(struct dwalk *walk) // do we ever need this? +{ + if (!dleaf_groups(walk->leaf)) { + trace("<<<<<<<<<<<<< dleaf empty"); + return; + } + if (walk->group + 1 == walk->gdict && walk->entry + 1 == walk->estop + group_count(walk->group)) { + trace(">>>>>>>>>>>>> empty dleaf"); + set_dleaf_groups(walk->leaf, 0); + return; + } + dwalk_back(walk); + dwalk_chop_after(walk); +} + +#if defined(main) || defined(__KERNEL__) +#define MAX_GROUP_ENTRIES 255 +#else +#define MAX_GROUP_ENTRIES 7 +#endif + +int dwalk_mock(struct dwalk *walk, tuxkey_t index, struct extent extent) +{ + if (!dleaf_groups(walk->leaf) || walk->entry == walk->estop || dwalk_index(walk) != index) { + trace("add entry 0x%Lx", (L)index); + unsigned keylo = index & 0xffffff, keyhi = index >> 24; + if (!walk->mock.groups || group_keyhi(&walk->mock.group) != keyhi || group_count(&walk->mock.group) >= MAX_GROUP_ENTRIES) { + trace("add group %i", walk->mock.groups); + walk->exbase += entry_limit(&walk->mock.entry); + walk->mock.group = make_group(keyhi, 0); + walk->mock.used -= sizeof(struct group); + walk->mock.groups++; + } + walk->mock.used -= sizeof(struct entry); + walk->mock.entry = make_entry(keylo, walk->extent - walk->exbase); + inc_group_count(&walk->mock.group, 1); + } + trace("add extent 0x%Lx => 0x%Lx/%x", (L)index, (L)extent_block(extent), extent_count(extent)); + walk->mock.free += sizeof(*walk->extent); + walk->extent++; + inc_entry_limit(&walk->mock.entry, 1); + return 0; +} + +int dwalk_pack(struct dwalk *walk, tuxkey_t index, struct extent extent) +{ + trace("group %ti/%i", walk->gstop + dleaf_groups(walk->leaf) - 1 - walk->group, dleaf_groups(walk->leaf)); + //printf("at entry %ti/%i\n", walk->estop + group_count(walk->group) - 1 - walk->entry, group_count(walk->group)); + if (!dleaf_groups(walk->leaf) || walk->entry == walk->estop || dwalk_index(walk) != index) { + trace("add entry 0x%Lx", (L)index); + unsigned keylo = index & 0xffffff, keyhi = index >> 24; + if (!dleaf_groups(walk->leaf) || group_keyhi(walk->group) != keyhi || group_count(walk->group) >= MAX_GROUP_ENTRIES) { + trace("add group %i", dleaf_groups(walk->leaf)); + /* will it fit? */ + assert(sizeof(struct entry) == sizeof(struct group)); + assert(from_be_u16(walk->leaf->free) <= from_be_u16(walk->leaf->used) - sizeof(*walk->entry)); + /* move entries down, adjust walk state */ + /* could preplan this to avoid move: need additional pack state */ + vecmove(walk->entry - 1, walk->entry, (struct entry *)walk->group - walk->entry); + walk->entry--; /* adjust to moved position */ + walk->exbase += dleaf_groups(walk->leaf) ? entry_limit(walk->entry) : 0; + *--walk->group = make_group(keyhi, 0); + walk->leaf->used = to_be_u16(from_be_u16(walk->leaf->used) - sizeof(struct group)); + inc_dleaf_groups(walk->leaf, 1); + } + assert(from_be_u16(walk->leaf->free) <= from_be_u16(walk->leaf->used) - sizeof(*walk->entry)); + walk->leaf->used = to_be_u16(from_be_u16(walk->leaf->used) - sizeof(struct entry)); + *--walk->entry = make_entry(keylo, walk->extent - walk->exbase); + inc_group_count(walk->group, 1); + } + trace("add extent %ti", walk->extent - walk->leaf->table); + //trace("add extent 0x%Lx => 0x%Lx/%x", (L)index, (L)extent.block, extent_count(extent)); + assert(from_be_u16(walk->leaf->free) + sizeof(*walk->extent) <= from_be_u16(walk->leaf->used)); + walk->leaf->free = to_be_u16(from_be_u16(walk->leaf->free) + sizeof(*walk->extent)); + *walk->extent++ = extent; + inc_entry_limit(walk->entry, 1); + return 0; // extent out of order??? leaf full??? +} + +int dleaf_check(BTREE, struct dleaf *leaf) +{ + struct group *gdict = (void *)leaf + btree->sb->blocksize, *gstop = gdict - dleaf_groups(leaf); + struct entry *edict = (void *)gstop, *entry = edict; + struct extent *extents = leaf->table; + unsigned excount = 0, encount = 0; + char *why; + + for (struct group *group = gdict - 1; group >= gstop; group--) { + entry -= group_count(group); + excount += entry_limit(entry); + encount += group_count(group); + } + //printf("encount = %i, excount = %i, \n", encount, excount); + why = "used count wrong"; + if (from_be_u16(leaf->used) != (void *)(edict - encount) - (void *)leaf) + goto eek; + why = "free count wrong"; + if (from_be_u16(leaf->free) != (void *)(extents + excount) - (void *)leaf) + goto eek; + why = "free check mismatch"; + if (from_be_u16(leaf->used) - from_be_u16(leaf->free) != dleaf_free2(btree, leaf)) + goto eek; + return 0; +eek: + printf("free %i, used %i\n", from_be_u16(leaf->free), from_be_u16(leaf->used)); + printf("%s!\n", why); + return -1; +} + +tuxkey_t dleaf_split(BTREE, tuxkey_t key, vleaf *from, vleaf *into) +{ + assert(dleaf_sniff(btree, from)); + struct dleaf *leaf = from, *dest = into; + struct group *groups = from + btree->sb->blocksize, *grbase = groups - dleaf_groups(leaf); + struct entry *entries = (void *)grbase; + printf("split %p into %p\n", leaf, dest); + unsigned encount = 0, recount = 0, grsplit = 0, exsplit = 0; + + /* find middle in terms of entries - may be unbalanced in extents */ + for (struct group *group = groups - 1; group >= grbase; group--) + encount += group_count(group); + unsigned split = encount / 2; + for (struct group *group = groups - 1; group >= grbase; group--, grsplit++) { + if (recount + group_count(group) > split) + break; + entries -= group_count(group); + exsplit += entry_limit(entries); + recount += group_count(group); + } + + /* have to split a group? */ + unsigned cut = split - recount; + if (cut) + exsplit += entry_limit(entries - cut); + entries = (void *)grbase; /* restore it */ + printf("split %i entries at group %i, entry %x\n", encount, grsplit, cut); + printf("split extents at %i\n", exsplit); + /* copy extents */ + unsigned size = from + from_be_u16(leaf->free) - (void *)(leaf->table + exsplit); + memcpy(dest->table, leaf->table + exsplit, size); + + /* copy groups */ + struct group *destgroups = (void *)dest + btree->sb->blocksize; + set_dleaf_groups(dest, dleaf_groups(leaf) - grsplit); + veccopy(destgroups - dleaf_groups(dest), grbase, dleaf_groups(dest)); + inc_group_count(destgroups - 1, -cut); + set_dleaf_groups(leaf, grsplit + !!cut); + grbase = groups - dleaf_groups(leaf); + if (cut) + set_group_count(groups - dleaf_groups(leaf), cut); + + /* copy entries */ + struct entry *destentries = (void *)(destgroups - dleaf_groups(dest)); + struct entry *enbase = entries - encount; + unsigned encopy = encount - split; + veccopy(destentries - encopy, enbase, encopy); + if (cut) + for (int i = 1; i <= group_count((destgroups - 1)); i++) + inc_entry_limit(destentries - i, -entry_limit(entries - split)); + vecmove(groups - dleaf_groups(leaf) - split, entries - split, split); + + /* clean up */ + leaf->free = to_be_u16((void *)(leaf->table + exsplit) - from); + dest->free = to_be_u16((void *)leaf->table + size - from); + leaf->used = to_be_u16((void *)(grbase - split) - from); + dest->used = to_be_u16((void *)(groups - dleaf_groups(dest) - encount + split) - from); + memset(from + from_be_u16(leaf->free), 0, from_be_u16(leaf->used) - from_be_u16(leaf->free)); + return (group_keyhi(destgroups - 1) << 24) | entry_keylo(destentries - 1); +} + +void dleaf_merge(BTREE, struct dleaf *leaf, struct dleaf *from) +{ + struct group *groups = (void *)leaf + btree->sb->blocksize, *grbase = groups - dleaf_groups(leaf); + struct entry *entries = (void *)grbase; + printf("merge %p into %p\n", from, leaf); + //assert(dleaf_need(from) <= dleaf_free(leaf)); + + /* append extents */ + unsigned size = from_be_u16(from->free) - sizeof(struct dleaf); + memcpy((void *)leaf + from_be_u16(leaf->free), from->table, size); + leaf->free = to_be_u16(from_be_u16(leaf->free) + size); + + /* merge last group (lowest) with first of from (highest)? */ + struct group *fromgroups = (void *)from + btree->sb->blocksize; + int uncut = dleaf_groups(leaf) && dleaf_groups(from) && (group_keyhi(fromgroups - 1) == group_keyhi(grbase)); + + /* make space and append groups except for possibly merged group */ + unsigned addgroups = dleaf_groups(from) - uncut; + struct group *grfrom = fromgroups - dleaf_groups(from); + struct entry *enfrom = (void *)from + from_be_u16(from->used); + struct entry *enbase = (void *)leaf + from_be_u16(leaf->used); + vecmove(enbase - addgroups, enbase, entries - enbase); + veccopy(grbase -= addgroups, grfrom, addgroups); + enbase -= addgroups; + if (uncut) + inc_group_count(grbase + addgroups, group_count(fromgroups - 1)); + inc_dleaf_groups(leaf, addgroups); + + /* append entries */ + size = (void *)grfrom - (void *)enfrom; + memcpy((void *)enbase - size, enfrom, size); + leaf->used = to_be_u16((void *)enbase - size - (void *)leaf); + + /* adjust entry limits for merged group */ + if (uncut) + for (int i = 1; i <= group_count((fromgroups - 1)); i++) + inc_entry_limit(enbase - i, entry_limit(enbase)); +} + +struct btree_ops dtree_ops = { + .leaf_sniff = dleaf_sniff, + .leaf_init = dleaf_init, + .leaf_dump = dleaf_dump, + .leaf_split = dleaf_split, +// .leaf_resize = dleaf_resize, + .leaf_chop = dleaf_chop, + .balloc = balloc, + .bfree = bfree, +}; diff --git a/fs/tux3/filemap.c b/fs/tux3/filemap.c new file mode 100644 index 0000000..9412b96 --- /dev/null +++ b/fs/tux3/filemap.c @@ -0,0 +1,485 @@ +#ifdef __KERNEL__ +#include +#include +#include +#endif +#include "tux3.h" + +#ifndef trace +#define trace trace_on +#endif + +#ifndef __KERNEL__ +/* + * Extrapolate from single buffer flush or blockread to opportunistic exent IO + * + * For write, try to include adjoining buffers above and below: + * - stop at first uncached or clean buffer in either direction + * + * For read (essentially readahead): + * - stop at first present buffer + * - stop at end of file + * + * For both, stop when extent is "big enough", whatever that means. + */ +void guess_extent(struct buffer_head *buffer, block_t *start, block_t *limit, int write) +{ + struct inode *inode = buffer_inode(buffer); + block_t ends[2] = { bufindex(buffer), bufindex(buffer) }; + for (int up = !write; up < 2; up++) { + while (ends[1] - ends[0] + 1 < MAX_EXTENT) { + block_t next = ends[up] + (up ? 1 : -1); + struct buffer_head *nextbuf = peekblk(buffer->map, next); + if (!nextbuf) { + if (write) + break; + if (next > inode->i_size >> tux_sb(inode->i_sb)->blockbits) + break; + } else { + unsigned stop = write ? !buffer_dirty(nextbuf) : buffer_empty(nextbuf); + brelse(nextbuf); + if (stop) + break; + } + ends[up] = next; /* what happens to the beer you send */ + } + } + *start = ends[0]; + *limit = ends[1] + 1; +} + +int filemap_extent_io(struct buffer_head *buffer, int write) +{ + struct inode *inode = buffer_inode(buffer); + struct sb *sb = tux_sb(inode->i_sb); + trace("%s inode 0x%Lx block 0x%Lx", write ? "write" : "read", (L)tux_inode(inode)->inum, (L)bufindex(buffer)); + if (bufindex(buffer) & (-1LL << MAX_BLOCKS_BITS)) + return -EIO; + struct dev *dev = sb->devmap->dev; + assert(dev->bits >= 8 && dev->fd); + int levels = tux_inode(inode)->btree.root.depth, try = 0, i, err; + if (!levels) { + if (!write) { + trace("unmapped block %Lx", (L)bufindex(buffer)); + memset(bufdata(buffer), 0, sb->blocksize); + set_buffer_uptodate(buffer); + return 0; + } + return -EIO; + } + if (write && buffer_empty(buffer)) + warn("egad, writing an invalid buffer"); + if (!write && buffer_dirty(buffer)) + warn("egad, reading a dirty buffer"); + + block_t start, limit; + guess_extent(buffer, &start, &limit, write); + printf("---- extent 0x%Lx/%Lx ----\n", (L)start, (L)limit - start); + struct extent seg[1000]; + struct tux_path *path = alloc_path(levels + 1); + if (!path) + return -ENOMEM; + + if ((err = probe(&tux_inode(inode)->btree, start, path))) { + free_path(path); + return err; + } +retry: + //assert(start >= this_key(path, levels)) + /* do not overlap next leaf */ + if (limit > next_key(path, levels)) + limit = next_key(path, levels); + unsigned segs = 0; + struct dleaf *leaf = bufdata(path[levels].buffer); + struct dwalk *walk = &(struct dwalk){ }; +dleaf_dump(&tux_inode(inode)->btree, leaf); + /* Probe below io start to include overlapping extents */ + dwalk_probe(leaf, sb->blocksize, walk, 0); // start at beginning of leaf just for now + + /* skip extents below start */ + for (struct extent *extent; (extent = dwalk_next(walk));) + if (dwalk_index(walk) + extent_count(*extent) > start) { + if (dwalk_index(walk) <= start) + dwalk_back(walk); + break; + } + struct dwalk rewind = *walk; + printf("prior extents:"); + for (struct extent *extent; (extent = dwalk_next(walk));) + printf(" 0x%Lx => %Lx/%x;", (L)dwalk_index(walk), (L)extent_block(*extent), extent_count(*extent)); + printf("\n"); + + if (dleaf_groups(leaf)) + printf("---- rewind to 0x%Lx => %Lx/%x ----\n", (L)dwalk_index(&rewind), (L)extent_block(*rewind.extent), extent_count(*rewind.extent)); + *walk = rewind; + + struct extent *next_extent = NULL; + block_t index = start, offset = 0; + while (index < limit) { + trace("index %Lx, limit %Lx", (L)index, (L)limit); + if (next_extent) { + trace("emit %Lx/%x", (L)extent_block(*next_extent), extent_count(*next_extent)); + seg[segs++] = *next_extent; + + unsigned count = extent_count(*next_extent); + if (start > dwalk_index(walk)) + count -= start - dwalk_index(walk); + index += count; + } + block_t next_index = limit; + if ((next_extent = dwalk_next(walk))) { + next_index = dwalk_index(walk); + trace("next_index = %Lx", (L)next_index); + if (next_index < start) { + offset = start - next_index; + next_index = start; + } + } + if (index < next_index) { + int gap = next_index - index; + trace("index = %Lx, offset = %Li, next = %Lx, gap = %i", (L)index, (L)offset, (L)next_index, gap); + if (index + gap > limit) + gap = limit - index; + trace("fill gap at %Lx/%x", (L)index, gap); + block_t block = 0; + if (write) { + block = balloc_extent(sb, gap); // goal ??? + if (block == -1) + goto nospace; // clean up !!! + } + seg[segs++] = make_extent(block, gap); + index += gap; + } + } + + if (write) { + while (next_extent) { + trace("save tail"); + seg[segs++] = *next_extent; + next_extent = dwalk_next(walk); + } + } + + printf("segs (offset = %Lx):", (L)offset); + for (i = 0, index = start; i < segs; i++) { + printf(" %Lx => %Lx/%x;", (L)index - offset, (L)extent_block(seg[i]), extent_count(seg[i])); + index += extent_count(seg[i]); + } + printf(" (%i)\n", segs); + + if (write) { + *walk = rewind; + for (i = 0, index = start - offset; i < segs; i++, index += extent_count(seg[i])) + dwalk_mock(walk, index, make_extent(extent_block(seg[i]), extent_count(seg[i]))); + trace("need %i data and %i index bytes", walk->mock.free, -walk->mock.used); + trace("need %i bytes, %u bytes free", walk->mock.free - walk->mock.used, dleaf_free(&tux_inode(inode)->btree, leaf)); + if (dleaf_free(&tux_inode(inode)->btree, leaf) <= walk->mock.free - walk->mock.used) { + trace_on("--------- split leaf ---------"); + assert(!try); + if ((err = btree_leaf_split(&tux_inode(inode)->btree, path, 0))) + goto eek; + try = 1; + goto retry; + } + + *walk = rewind; + if (dleaf_groups(leaf)) + dwalk_chop_after(walk); + for (i = 0, index = start - offset; i < segs; i++) { + trace("pack 0x%Lx => %Lx/%x", (L)index, (L)extent_block(seg[i]), extent_count(seg[i])); + dwalk_pack(walk, index, make_extent(extent_block(seg[i]), extent_count(seg[i]))); + index += extent_count(seg[i]); + } + mark_buffer_dirty(path[tux_inode(inode)->btree.root.depth].buffer); + + //dleaf_dump(&tux_inode(inode)->btree, leaf); + /* assert we used exactly the expected space */ + /* assert(??? == ???); */ + /* check leaf */ + if (0) + goto eek; + } + + unsigned skip = offset; + for (i = 0, index = start - offset; !err && index < limit; i++) { + unsigned count = extent_count(seg[i]); + trace_on("extent 0x%Lx/%x => %Lx", (L)index, count, (L)extent_block(seg[i])); + for (int j = skip; !err && j < count; j++) { + block_t block = extent_block(seg[i]) + j; + buffer = blockget(mapping(inode), index + j); + trace_on("block 0x%Lx => %Lx", (L)bufindex(buffer), (L)block); + if (write) { + err = diskwrite(dev->fd, bufdata(buffer), sb->blocksize, block << dev->bits); + } else { + if (!block) { /* block zero is never allocated */ + trace("zero fill buffer"); + memset(bufdata(buffer), 0, sb->blocksize); + continue; + } + err = diskread(dev->fd, bufdata(buffer), sb->blocksize, block << dev->bits); + } + brelse(set_buffer_uptodate(buffer)); // leave empty if error ??? + } + index += count; + skip = 0; + } + release_path(path, levels + 1); + free_path(path); + return err; +nospace: + err = -ENOSPC; +eek: + warn("could not add extent to tree: %d", err); + release_path(path, levels + 1); + free_path(path); + // free blocks and try to clean up ??? + return -EIO; +} +#else /* __KERNEL__ */ +int tux3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int blocksize = 1 << inode->i_blkbits; + sector_t last_iblock; + + last_iblock = (i_size_read(inode) + (blocksize - 1)) + >> inode->i_blkbits; + if (iblock >= last_iblock) + return -EIO; + + struct sb *sbi = tux_sb(inode->i_sb); + size_t max_blocks = bh_result->b_size >> inode->i_blkbits; + int levels = tux_inode(inode)->btree.root.depth, err; + if (!levels) { + trace("unmapped block %Lx", (L)iblock); + return 0; + } + + block_t start = iblock, limit = iblock + max_blocks; + struct extent seg[10]; + struct tux_path *path = alloc_path(levels + 1); + if (!path) + return -ENOMEM; + + if ((err = probe(&tux_inode(inode)->btree, start, path))) { + free_path(path); + return err; + } + + //assert(start >= this_key(path, levels)) + /* do not overlap next leaf */ + if (limit > next_key(path, levels)) + limit = next_key(path, levels); + unsigned segs = 0; + struct dleaf *leaf = bufdata(path[levels].buffer); + struct dwalk *walk = &(struct dwalk){ }; + dleaf_dump(&tux_inode(inode)->btree, leaf); + /* Probe below io start to include overlapping extents */ + dwalk_probe(leaf, sbi->blocksize, walk, 0); // start at beginning of leaf just for now + + /* skip extents below start */ + for (struct extent *extent; (extent = dwalk_next(walk));) + if (dwalk_index(walk) + extent_count(*extent) > start) { + if (dwalk_index(walk) <= start) + dwalk_back(walk); + break; + } + + struct extent *next_extent = NULL; + block_t index = start, offset = 0; + while (index < limit && segs < ARRAY_SIZE(seg)) { + trace("index %Lx, limit %Lx", (L)index, (L)limit); + if (next_extent) { + trace("emit %Lx/%x", (L)extent_block(*next_extent), extent_count(*next_extent)); + seg[segs++] = *next_extent; + + unsigned count = extent_count(*next_extent); + if (start > dwalk_index(walk)) + count -= start - dwalk_index(walk); + index += count; + } + block_t next_index = limit; + if ((next_extent = dwalk_next(walk))) { + next_index = dwalk_index(walk); + trace("next_index = %Lx", (L)next_index); + if (next_index < start) { + offset = start - next_index; + next_index = start; + } + } + if (index < next_index) { + /* there is gap (hole), so stop */ + break; + } + } + + block_t block = extent_block(seg[0]) + offset; + size_t count = extent_count(seg[0]) - offset; + for (int i = 1; i < segs; i++) { + if (block + count != extent_block(seg[i])) + break; + count += extent_count(seg[i]); + } + if (block) { + map_bh(bh_result, inode->i_sb, block); + bh_result->b_size = min(max_blocks, count) << sbi->blockbits; + } + trace("%s: block %Lu, size %zu", __func__, + (L)bh_result->b_blocknr, bh_result->b_size); + + release_path(path, levels + 1); + free_path(path); + + return err; +} + +struct buffer_head *blockread(struct address_space *mapping, block_t iblock) +{ + struct inode *inode = mapping->host; + pgoff_t index; + struct page *page; + struct buffer_head *bh; + int offset; + + printk("%s: ==> ino %Lu, block %Lu\n", __func__, tux_inode(inode)->inum, iblock); + + index = iblock >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + offset = iblock & ((PAGE_CACHE_SHIFT - inode->i_blkbits) - 1); + + page = read_mapping_page(mapping, index, NULL); + if (!IS_ERR(page)) { + if (PageError(page)) + goto error; + } + + lock_page(page); + + if (!page_has_buffers(page)) + create_empty_buffers(page, tux_sb(inode->i_sb)->blocksize, 0); + + bh = page_buffers(page); + while (offset--) + bh = bh->b_this_page; + get_bh(bh); + + unlock_page(page); + page_cache_release(page); + + printk("%s: <=== b_blocknr %Lu\n", __func__, (L)bh->b_blocknr); + + return bh; + +error: + page_cache_release(page); + return NULL; +} + +struct buffer_head *blockget(struct address_space *mapping, block_t iblock) +{ + struct inode *inode = mapping->host; + pgoff_t index; + struct page *page; + struct buffer_head *bh; + int offset; + + printk("%s: ino %Lu, block %Lu\n", __func__, tux_inode(inode)->inum, iblock); + + index = iblock >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + offset = iblock & ((PAGE_CACHE_SHIFT - inode->i_blkbits) - 1); + + page = grab_cache_page(mapping, index); + if (!page) + return NULL; + + if (!page_has_buffers(page)) + create_empty_buffers(page, tux_sb(inode->i_sb)->blocksize, 0); + + bh = page_buffers(page); + while (offset--) + bh = bh->b_this_page; + get_bh(bh); + + unlock_page(page); + page_cache_release(page); + + return bh; +} + +static int tux3_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, tux3_get_block); +} + +static int tux3_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, tux3_get_block); +} + +static int tux3_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + *pagep = NULL; + return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + tux3_get_block); +} + +static int tux3_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, tux3_get_block, wbc); +} + +static int tux3_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, tux3_get_block); +} + +static ssize_t tux3_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, tux3_get_block, NULL); +} + +static sector_t tux3_bmap(struct address_space *mapping, sector_t iblock) +{ + sector_t blocknr; + + mutex_lock(&mapping->host->i_mutex); + blocknr = generic_block_bmap(mapping, iblock, tux3_get_block); + mutex_unlock(&mapping->host->i_mutex); + + return blocknr; +} + +const struct address_space_operations tux_aops = { + .readpage = tux3_readpage, + .readpages = tux3_readpages, + .writepage = tux3_writepage, + .writepages = tux3_writepages, + .sync_page = block_sync_page, + .write_begin = tux3_write_begin, + .write_end = generic_write_end, + .bmap = tux3_bmap, +// .invalidatepage = ext4_da_invalidatepage, +// .releasepage = ext4_releasepage, + .direct_IO = tux3_direct_IO, + .migratepage = buffer_migrate_page, +// .is_partially_uptodate = block_is_partially_uptodate, +}; + +static int tux3_dir_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, tux3_get_block); +} + +const struct address_space_operations tux_dir_aops = { + .readpage = tux3_dir_readpage, + .bmap = tux3_bmap, +}; +#endif /* __KERNEL__ */ diff --git a/fs/tux3/hexdump.c b/fs/tux3/hexdump.c new file mode 100644 index 0000000..9b409e9 --- /dev/null +++ b/fs/tux3/hexdump.c @@ -0,0 +1,22 @@ +/* Copyright (c) 2008 Daniel Phillips , GPL v3 */ + +#include "tux3.h" + +void hexdump(void *data, unsigned size) +{ + while (size) { + unsigned char *p; + int w = 16, n = size < w? size: w, pad = w - n; + printf("%p: ", data); + for (p = data; p < (unsigned char *)data + n;) + printf("%02hx ", *p++); + printf("%*.s \"", pad*3, ""); + for (p = data; p < (unsigned char *)data + n;) { + int c = *p++; + printf("%c", c < ' ' || c > 127 ? '.' : c); + } + printf("\"\n"); + data += w; + size -= n; + } +} diff --git a/fs/tux3/iattr.c b/fs/tux3/iattr.c new file mode 100644 index 0000000..e69b352 --- /dev/null +++ b/fs/tux3/iattr.c @@ -0,0 +1,188 @@ +/* + * Inode table attributes + * + * Original copyright (c) 2008 Daniel Phillips + * Portions copyright (c) 2006-2008 Google Inc. + * Licensed under the GPL version 3 + * + * By contributing changes to this file you grant the original copyright holder + * the right to distribute those changes under any license. + */ + +#include "tux3.h" + +/* + * Variable size attribute format: + * + * immediate data: kind+version:16, bytes:16, data[bytes] + * immediate xattr: kind+version:16, bytes:16, atom:16, data[bytes - 2] + */ + +unsigned atsize[MAX_ATTRS] = { + [MODE_OWNER_ATTR] = 12, + [CTIME_SIZE_ATTR] = 14, + [DATA_BTREE_ATTR] = 8, + [LINK_COUNT_ATTR] = 4, + [MTIME_ATTR] = 6, + [IDATA_ATTR] = 2, + [XATTR_ATTR] = 4, +}; + +unsigned encode_asize(unsigned bits) +{ + unsigned need = 0; + for (int kind = MIN_ATTR; kind < VAR_ATTRS; kind++) + if ((bits & (1 << kind))) + need += atsize[kind] + 2; + return need; +} + +int attr_check(void *attrs, unsigned size) +{ + void *limit = attrs + size; + unsigned head; + while (attrs < limit - 1) + { + attrs = decode16(attrs, &head); + unsigned kind = head >> 12; + if (kind < MIN_ATTR || kind >= MAX_ATTRS) + return 0; + if (attrs + atsize[kind] > limit) + return 0; + attrs += atsize[kind]; + } + return 1; +} + +void dump_attrs(struct inode *inode) +{ + //printf("present = %x\n", inode->present); + for (int kind = 0; kind < 32; kind++) { + if (!(tux_inode(inode)->present & (1 << kind))) + continue; + switch (kind) { + case MODE_OWNER_ATTR: + printf("mode 0%.6o uid %x gid %x ", inode->i_mode, inode->i_uid, inode->i_gid); + break; + case DATA_BTREE_ATTR: + printf("root %Lx:%u ", (L)tux_inode(inode)->btree.root.block, tux_inode(inode)->btree.root.depth); + break; + case CTIME_SIZE_ATTR: + printf("ctime %Lx size %Lx ", (L)tuxtime(inode->i_ctime), (L)inode->i_size); + break; + case MTIME_ATTR: + printf("mtime %Lx ", (L)tuxtime(inode->i_mtime)); + break; + case LINK_COUNT_ATTR: + printf("links %u ", inode->i_nlink); + break; + case XATTR_ATTR: + printf("xattr(s) "); + break; + default: + printf("<%i>? ", kind); + break; + } + } + printf("\n"); +} + +void *encode_attrs(struct inode *inode, void *attrs, unsigned size) +{ + //printf("encode %u attr bytes\n", size); + void *limit = attrs + size - 3; + for (int kind = MIN_ATTR; kind < VAR_ATTRS; kind++) { + if (!(tux_inode(inode)->present & (1 << kind))) + continue; + if (attrs >= limit) + break; + attrs = encode_kind(attrs, kind, tux_sb(inode->i_sb)->version); + switch (kind) { + case MODE_OWNER_ATTR: + attrs = encode32(attrs, inode->i_mode); + attrs = encode32(attrs, inode->i_uid); + attrs = encode32(attrs, inode->i_gid); + break; + case CTIME_SIZE_ATTR: + attrs = encode48(attrs, tuxtime(inode->i_ctime) >> TIME_ATTR_SHIFT); + attrs = encode64(attrs, inode->i_size); + break; + case MTIME_ATTR: + attrs = encode48(attrs, tuxtime(inode->i_mtime) >> TIME_ATTR_SHIFT); + break; + case DATA_BTREE_ATTR:; + struct root *root = &tux_inode(inode)->btree.root; + attrs = encode64(attrs, ((u64)root->depth << 48) | root->block); + break; + case LINK_COUNT_ATTR: + attrs = encode32(attrs, inode->i_nlink); + break; + } + } + return attrs; +} + +void *decode_attrs(struct inode *inode, void *attrs, unsigned size) +{ + //printf("decode %u attr bytes\n", size); + u64 v64; + u32 v32; + struct xattr *xattr = tux_inode(inode)->xcache ? tux_inode(inode)->xcache->xattrs : NULL; + void *limit = attrs + size; + while (attrs < limit - 1) { + unsigned head; + attrs = decode16(attrs, &head); + unsigned version = head & 0xfff, kind = head >> 12; + if (version != tux_sb(inode->i_sb)->version) { + attrs += atsize[kind]; + continue; + } + switch (kind) { + case MODE_OWNER_ATTR: + attrs = decode32(attrs, &v32); + inode->i_mode = v32; + attrs = decode32(attrs, &v32); + inode->i_uid = v32; + attrs = decode32(attrs, &v32); + inode->i_gid = v32; + break; + case CTIME_SIZE_ATTR: + attrs = decode48(attrs, &v64); + attrs = decode64(attrs, &inode->i_size); + inode->i_ctime = spectime(v64 << TIME_ATTR_SHIFT); + break; + case MTIME_ATTR: + attrs = decode48(attrs, &v64); + inode->i_mtime = spectime(v64 << TIME_ATTR_SHIFT); + break; + case DATA_BTREE_ATTR: + attrs = decode64(attrs, &v64); + tux_inode(inode)->btree = (struct btree){ .sb = tux_sb(inode->i_sb), .entries_per_leaf = 64, // !!! should depend on blocksize + .ops = &dtree_ops, + .root = { .block = v64 & (-1ULL >> 16), .depth = v64 >> 48 } }; + break; + case LINK_COUNT_ATTR: + attrs = decode32(attrs, &inode->i_nlink); + break; + case XATTR_ATTR:; + // immediate xattr: kind+version:16, bytes:16, atom:16, data[bytes - 2] + unsigned bytes, atom; + attrs = decode16(attrs, &bytes); + attrs = decode16(attrs, &atom); + *xattr = (struct xattr){ .atom = atom, .size = bytes - 2 }; + unsigned xsize = sizeof(struct xattr) + xattr->size; + assert((void *)xattr + xsize <= (void *)tux_inode(inode)->xcache + tux_inode(inode)->xcache->maxsize); + memcpy(xattr->body, attrs, xattr->size); + attrs += xattr->size; + tux_inode(inode)->xcache->size += xsize; + xattr = xcache_next(xattr); // check limit!!! + break; + default: + return NULL; + } + tux_inode(inode)->present |= 1 << kind; + } + if (!(tux_inode(inode)->present & MTIME_BIT)) + inode->i_mtime = inode->i_ctime; + return attrs; +} diff --git a/fs/tux3/ileaf.c b/fs/tux3/ileaf.c new file mode 100644 index 0000000..6982fca --- /dev/null +++ b/fs/tux3/ileaf.c @@ -0,0 +1,286 @@ +/* + * Inode table btree leaf operations + * + * Original copyright (c) 2008 Daniel Phillips + * Licensed under the GPL version 3 + * + * By contributing changes to this file you grant the original copyright holder + * the right to distribute those changes under any license. + */ + +#include "tux3.h" + +struct ileaf { be_u16 magic, count; u32 pad; be_u64 ibase; char table[]; }; + +/* + * inode leaf format + * + * A leaf has a small header followed by a table of attributes. A vector of + * offsets within the block grows down from the top of the leaf towards the + * top of the attribute table, indexed by the difference between inum and + * leaf->ibase, the base inum of the table block. + */ + +static inline unsigned atdict(be_u16 *dict, unsigned at) +{ + return at ? from_be_u16(*(dict - at)) : 0; +} + +static inline void add_idict(be_u16 *dict, int n) +{ + *dict = to_be_u16(from_be_u16(*dict) + n); +} + +static inline unsigned icount(struct ileaf *leaf) +{ + return from_be_u16(leaf->count); +} + +static inline tuxkey_t ibase(struct ileaf *leaf) +{ + return from_be_u64(leaf->ibase); +} + +int ileaf_init(BTREE, vleaf *leaf) +{ + printf("initialize inode leaf %p\n", leaf); + *(struct ileaf *)leaf = (struct ileaf){ to_be_u16(0x90de) }; + return 0; +} + +int ileaf_sniff(BTREE, vleaf *leaf) +{ + return ((struct ileaf *)leaf)->magic == to_be_u16(0x90de); +} + +unsigned ileaf_need(BTREE, vleaf *vleaf) +{ + be_u16 *dict = vleaf + btree->sb->blocksize; + unsigned count = icount(to_ileaf(vleaf)); + return atdict(dict, count) + count * sizeof(*dict); +} + +unsigned ileaf_free(BTREE, vleaf *leaf) +{ + return btree->sb->blocksize - ileaf_need(btree, leaf) - sizeof(struct ileaf); +} + +void ileaf_dump(BTREE, vleaf *vleaf) +{ + SB = btree->sb; + struct ileaf *leaf = vleaf; + inum_t inum = ibase(leaf); + be_u16 *dict = vleaf + sb->blocksize; + unsigned offset = 0; + printf("inode table block 0x%Lx/%i (%x bytes free)\n", (L)ibase(leaf), icount(leaf), ileaf_free(btree, leaf)); + //hexdump(dict - icount(leaf), icount(leaf) * 2); + for (int i = -1; -i <= icount(leaf); i--, inum++) { + int limit = from_be_u16(dict[i]), size = limit - offset; + if (!size) + continue; + printf(" 0x%Lx: ", (L)inum); + //printf("[%x] ", offset); + if (size < 0) + printf("\n"); + else if (!size) + printf("\n"); + else { +#ifndef main + hexdump(leaf->table + offset, size); +#else + struct inode inode = { .i_sb = btree->sb, .xcache = new_xcache(9999) }; + decode_attrs(&inode, leaf->table + offset, size); + dump_attrs(&inode); + xcache_dump(&inode); + free(inode.xcache); +#endif + } + offset = limit; + } +} + +void *ileaf_lookup(BTREE, inum_t inum, struct ileaf *leaf, unsigned *result) +{ + assert(inum >= ibase(leaf)); + assert(inum < ibase(leaf) + btree->entries_per_leaf); + unsigned at = inum - ibase(leaf), size = 0; + void *attrs = NULL; + printf("lookup inode 0x%Lx, %Lx + %x\n", (L)inum, (L)ibase(leaf), at); + if (at < icount(leaf)) { + be_u16 *dict = (void *)leaf + btree->sb->blocksize; + unsigned offset = atdict(dict, at); + if ((size = from_be_u16(*(dict - at - 1)) - offset)) + attrs = leaf->table + offset; + } + *result = size; + return attrs; +} + +int isinorder(BTREE, struct ileaf *leaf) +{ + be_u16 *dict = (void *)leaf + btree->sb->blocksize; + for (int i = 0, offset = 0, limit; --i >= -icount(leaf); offset = limit) + if ((limit = from_be_u16(dict[i])) < offset) + return 0; + return 1; +} + +int ileaf_check(BTREE, struct ileaf *leaf) +{ + char *why; + why = "not an inode table leaf"; + if (leaf->magic != to_be_u16(0x90de)) + goto eek; + why = "dict out of order"; + if (!isinorder(btree, leaf)) + goto eek; + return 0; +eek: + printf("%s!\n", why); + return -1; +} + +void ileaf_trim(BTREE, struct ileaf *leaf) { + be_u16 *dict = (void *)leaf + btree->sb->blocksize; + while (icount(leaf) > 1 && *(dict - icount(leaf)) == *(dict - icount(leaf) + 1)) + leaf->count = to_be_u16(from_be_u16(leaf->count) - 1); + if (icount(leaf) == 1 && !*(dict - 1)) + leaf->count = 0; +} + +#define SPLIT_AT_INUM + +tuxkey_t ileaf_split(BTREE, tuxkey_t inum, vleaf *from, vleaf *into) +{ + assert(ileaf_sniff(btree, from)); + struct ileaf *leaf = from, *dest = into; + be_u16 *dict = from + btree->sb->blocksize, *destdict = into + btree->sb->blocksize; + +#ifdef SPLIT_AT_INUM + printf("split at inum 0x%Lx\n", (L)inum); + assert(inum >= ibase(leaf)); + unsigned at = inum - ibase(leaf) < icount(leaf) ? inum - ibase(leaf) : icount(leaf); +#else + /* binsearch inum starting nearest middle of block */ + unsigned at = 1, hi = icount(leaf); + while (at < hi) { + int mid = (at + hi) / 2; + if (*(dict - mid) < (btree->sb->blocksize / 2)) + at = mid + 1; + else + hi = mid; + } +#endif + /* should trim leading empty inodes on copy */ + unsigned split = atdict(dict, at), free = from_be_u16(*(dict - icount(leaf))); + printf("split at %x of %x\n", at, icount(leaf)); + printf("copy out %x bytes at %x\n", free - split, split); + assert(free >= split); + memcpy(dest->table, leaf->table + split, free - split); + dest->count = to_be_u16(icount(leaf) - at); + veccopy(destdict - icount(dest), dict - icount(leaf), icount(dest)); + for (int i = 1; i <= icount(dest); i++) + add_idict(destdict - i, -split); +#ifdef SPLIT_AT_INUM + /* round down to multiple of 64 above ibase */ + inum_t round = inum & ~(inum_t)(btree->entries_per_leaf - 1); + dest->ibase = to_be_u64(round > ibase(leaf) + icount(leaf) ? round : inum); +#else + dest->ibase = to_be_u64(ibase(leaf) + at); +#endif + leaf->count = to_be_u16(at); + memset(leaf->table + split, 0, (char *)(dict - icount(leaf)) - (leaf->table + split)); + ileaf_trim(btree, leaf); + return ibase(dest); +} + +void ileaf_merge(BTREE, struct ileaf *leaf, struct ileaf *from) +{ + if (!icount(from)) + return; + be_u16 *dict = (void *)leaf + btree->sb->blocksize; + be_u16 *fromdict = (void *)from + btree->sb->blocksize; + unsigned at = icount(leaf), free = atdict(dict, at), size = atdict(fromdict, icount(from)); + printf("copy in %i bytes\n", size); + memcpy(leaf->table + free, from->table, size); + leaf->count = to_be_u16(from_be_u16(leaf->count) + icount(from)); + veccopy(dict - icount(leaf), fromdict - icount(from), icount(from)); + for (int i = at + 1; at && i <= at + icount(from); i++) + add_idict(dict - i, from_be_u16(*(dict - at))); +} + +void *ileaf_resize(BTREE, tuxkey_t inum, vleaf *base, unsigned newsize) +{ + assert(ileaf_sniff(btree, base)); + struct ileaf *leaf = base; + assert(inum >= ibase(leaf)); + be_u16 *dict = base + btree->sb->blocksize; + + unsigned at = inum - ibase(leaf); + if (at >= btree->entries_per_leaf) + return NULL; + + unsigned extend_empty = at < icount(leaf) ? 0 : at - icount(leaf) + 1; + unsigned offset = at && icount(leaf) ? from_be_u16(*(dict - (at < icount(leaf) ? at : icount(leaf)))) : 0; + unsigned size = at < icount(leaf) ? from_be_u16(*(dict - at - 1)) - offset : 0; + int more = newsize - size; + if (more > 0 && sizeof(*dict) * extend_empty + more > ileaf_free(btree, leaf)) + return NULL; + for (; extend_empty--; leaf->count = to_be_u16(from_be_u16(leaf->count) + 1)) + *(dict - icount(leaf) - 1) = to_be_u16(atdict(dict, icount(leaf))); + assert(icount(leaf)); + unsigned itop = from_be_u16(*(dict - icount(leaf))); + void *attrs = leaf->table + offset; + printf("resize inum 0x%Lx at 0x%x from %x to %x\n", (L)inum, offset, size, newsize); + + assert(itop >= offset + size); + memmove(attrs + newsize, attrs + size, itop - offset - size); + for (int i = at + 1; i <= icount(leaf); i++) + add_idict(dict - i, more); + return attrs; +} + +inum_t find_empty_inode(BTREE, struct ileaf *leaf, inum_t goal) +{ + assert(goal >= ibase(leaf)); + goal -= ibase(leaf); + //printf("find empty inode starting at %Lx, base %Lx\n", (L)goal, (L)ibase(leaf)); + be_u16 *dict = (void *)leaf + btree->sb->blocksize; + unsigned i, offset = goal && goal < icount(leaf) ? from_be_u16(*(dict - goal)) : 0; + for (i = goal; i < icount(leaf); i++) { + unsigned limit = from_be_u16(*(dict - i - 1)); + if (offset == limit) + break; + offset = limit; + } + return i + ibase(leaf); +} + +int ileaf_purge(BTREE, inum_t inum, struct ileaf *leaf) +{ + if (inum < ibase(leaf) || inum - ibase(leaf) >= btree->entries_per_leaf) + return -EINVAL; + be_u16 *dict = (void *)leaf + btree->sb->blocksize; + unsigned at = inum - ibase(leaf); + unsigned offset = atdict(dict, at); + unsigned size = from_be_u16(*(dict - at - 1)) - offset; + printf("delete inode %Lx from %p[%x/%x]\n", (L)inum, leaf, at, size); + if (!size) + return -ENOENT; + unsigned free = from_be_u16(*(dict - icount(leaf))), tail = free - offset - size; + assert(offset + size + tail <= free); + memmove(leaf->table + offset, leaf->table + offset + size, tail); + for (int i = at + 1; i <= icount(leaf); i++) + add_idict(dict - i, -size); + ileaf_trim(btree, leaf); + return 0; +} + +struct btree_ops itable_ops = { + .leaf_dump = ileaf_dump, + .leaf_sniff = ileaf_sniff, + .leaf_init = ileaf_init, + .leaf_split = ileaf_split, + .leaf_resize = ileaf_resize, + .balloc = balloc, +}; diff --git a/fs/tux3/inode.c b/fs/tux3/inode.c new file mode 100644 index 0000000..ce41232 --- /dev/null +++ b/fs/tux3/inode.c @@ -0,0 +1,277 @@ +/* + * Tux3 versioning filesystem in user space + * + * Original copyright (c) 2008 Daniel Phillips + * Portions copyright (c) 2006-2008 Google Inc. + * Licensed under the GPL version 3 + * + * By contributing changes to this file you grant the original copyright holder + * the right to distribute those changes under any license. + */ + +#include "tux3.h" + +#ifndef trace +#define trace trace_on +#endif + +int store_attrs(struct inode *inode, struct tux_path path[]) +{ + unsigned size = encode_asize(tux_inode(inode)->present) + encode_xsize(inode); + void *base = tree_expand(&tux_sb(inode->i_sb)->itable, tux_inode(inode)->inum, size, path); + if (!base) + return -ENOMEM; // what was the actual error??? + void *attr = encode_attrs(inode, base, size); + attr = encode_xattrs(inode, attr, base + size - attr); + assert(attr == base + size); + mark_buffer_dirty(path[tux_sb(inode->i_sb)->itable.root.depth].buffer); + return 0; +} + +/* + * Inode table expansion algorithm + * + * First probe for the inode goal. This retreives the rightmost leaf that + * contains an inode less than or equal to the goal. (We could in theory avoid + * retrieving any leaf at all in some cases if we observe that the the goal must + * fall into an unallocated gap between two index keys, for what that is worth. + * Probably not very much.) + * + * If not at end then next key is greater than goal. This block has the highest + * ibase less than or equal to goal. Ibase should be equal to btree key, so + * assert. Search block even if ibase is way too low. If goal comes back equal + * to next_key then there is no room to create more inodes in it, so advance to + * the next block and repeat. + * + * Otherwise, expand the inum goal that came back. If ibase was too low to + * create the inode in that block then the low level split will fail and expand + * will create a new inode table block with ibase at the goal. We round the + * goal down to some binary multiple in ileaf_split to reduce the chance of + * creating inode table blocks with only a small number of inodes. (Actually + * we should only round down the split point, not the returned goal.) + */ + +int make_inode(struct inode *inode, struct tux_iattr *iattr) +{ + SB = tux_sb(inode->i_sb); + int err = -ENOENT, levels = sb->itable.root.depth; + struct tux_path *path = alloc_path(levels + 1); + if (!path) + return -ENOMEM; + + if ((err = probe(&sb->itable, tux_inode(inode)->inum, path))) { + free_path(path); + return err; + } + struct buffer_head *leafbuf = path[levels].buffer; +// struct ileaf *leaf = to_ileaf(bufdata(leafbuf)); + + trace("create inode 0x%Lx", (L)tux_inode(inode)->inum); + assert(!tux_inode(inode)->btree.root.depth); + inum_t inum = tux_inode(inode)->inum; + assert(inum < next_key(path, levels)); + while (1) { +// printf("find empty inode in [%Lx] base %Lx\n", (L)bufindex(leafbuf), (L)ibase(leaf)); + inum = find_empty_inode(&sb->itable, bufdata(leafbuf), (L)inum); + printf("result inum is %Lx, limit is %Lx\n", (L)inum, (L)next_key(path, levels)); + if (inum < next_key(path, levels)) + break; + int more = advance(&sb->itable, path); + printf("no more inode space here, advance %i\n", more); + if (!more) + goto errout; + } + + inode->i_mode = iattr->mode; + inode->i_uid = iattr->uid; + inode->i_gid = iattr->gid; + inode->i_mtime = inode->i_ctime = inode->i_atime = iattr->ctime; + inode->i_nlink = 1; + tux_inode(inode)->inum = inum; + tux_inode(inode)->btree = new_btree(sb, &dtree_ops); // error??? + tux_inode(inode)->present = CTIME_SIZE_BIT|MODE_OWNER_BIT|DATA_BTREE_BIT; + if ((err = store_attrs(inode, path))) + goto eek; + release_path(path, levels + 1); + free_path(path); + return 0; +eek: + release_path(path, levels + 1); +errout: + free_path(path); + warn("make_inode 0x%Lx failed (%d)", (L)tux_inode(inode)->inum, err); + return err; +} + +static int open_inode(struct inode *inode) +{ + SB = tux_sb(inode->i_sb); + int err, levels = sb->itable.root.depth; + struct tux_path *path = alloc_path(levels + 1); + if (!path) + return -ENOMEM; + + if ((err = probe(&sb->itable, tux_inode(inode)->inum, path))) { + free_path(path); + return err; + } + unsigned size; + void *attrs = ileaf_lookup(&sb->itable, tux_inode(inode)->inum, bufdata(path[levels].buffer), &size); + if (!attrs) { + err = -ENOENT; + goto eek; + } + trace("found inode 0x%Lx", (L)tux_inode(inode)->inum); + //ileaf_dump(&sb->itable, path[levels].buffer->data); + //hexdump(attrs, size); + unsigned xsize = decode_xsize(inode, attrs, size); + err = -ENOMEM; + if (!(tux_inode(inode)->xcache = new_xcache(xsize))) // !!! only do this when we hit an xattr !!! + goto eek; + decode_attrs(inode, attrs, size); // error??? + dump_attrs(inode); + if (tux_inode(inode)->xcache) + xcache_dump(inode); + err = 0; +eek: + release_path(path, levels + 1); + free_path(path); + return err; +} + +int save_inode(struct inode *inode) +{ + trace("save inode 0x%Lx", (L)tux_inode(inode)->inum); + SB = tux_sb(inode->i_sb); + int err, levels = sb->itable.root.depth; + struct tux_path *path = alloc_path(levels + 1); + if (!path) + return -ENOMEM; + + if ((err = probe(&sb->itable, tux_inode(inode)->inum, path))) { + free_path(path); + return err; + } + unsigned size; + if (!(ileaf_lookup(&sb->itable, tux_inode(inode)->inum, bufdata(path[levels].buffer), &size))) + return -EINVAL; + err = store_attrs(inode, path); + release_path(path, levels + 1); + free_path(path); + return err; +} + +int purge_inum(BTREE, inum_t inum) +{ + int err = -ENOENT, levels = btree->sb->itable.root.depth; + struct tux_path *path = alloc_path(levels + 1); + if (!path) + return -ENOMEM; + + if (!(err = probe(btree, inum, path))) { + struct ileaf *ileaf = to_ileaf(bufdata(path[levels].buffer)); + err = ileaf_purge(btree, inum, ileaf); + release_path(path, levels + 1); + } + free_path(path); + return err; +} + +#ifdef __KERNEL__ +void tux3_clear_inode(struct inode *inode) +{ + if (tux_inode(inode)->xcache) + kfree(tux_inode(inode)->xcache); +} + +int tux3_write_inode(struct inode *inode, int do_sync) +{ + return save_inode(inode); +} + +static const struct file_operations tux_file_fops = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, +// .unlocked_ioctl = fat_generic_ioctl, +#ifdef CONFIG_COMPAT +// .compat_ioctl = fat_compat_dir_ioctl, +#endif + .mmap = generic_file_mmap, + .open = generic_file_open, +// .fsync = file_fsync, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +}; + +static const struct inode_operations tux_file_iops = { +// .truncate = ext4_truncate, +// .permission = ext4_permission, +// .setattr = ext4_setattr, +// .getattr = ext4_getattr +#ifdef CONFIG_EXT4DEV_FS_XATTR +// .setxattr = generic_setxattr, +// .getxattr = generic_getxattr, +// .listxattr = ext4_listxattr, +// .removexattr = generic_removexattr, +#endif +// .fallocate = ext4_fallocate, +// .fiemap = ext4_fiemap, +}; + +struct inode *tux3_iget(struct super_block *sb, inum_t inum) +{ + struct sb *sbi = tux_sb(sb); + struct inode *inode; + int err; + + /* FIXME: inum is 64bit, ino is unsigned long */ + inode = iget_locked(sb, inum); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + tux_inode(inode)->inum = inum; + err = open_inode(inode); + if (err) { + iget_failed(inode); + return ERR_PTR(err); + } + + inode->i_ino = inum; /* FIXME: will overflow on 32bit arch */ + inode->i_version = 1; + inode->i_blocks = ((inode->i_size + sbi->blockmask) + & ~(loff_t)sbi->blockmask) >> 9; +// inode->i_generation = 0; +// inode->i_flags = 0; + + switch (inode->i_mode & S_IFMT) { + default: +// inode->i_op = &tux3_special_inode_operations; +// init_special_inode(inode, inode->i_mode, new_decode_dev(dev)); + break; + case S_IFREG: + inode->i_op = &tux_file_iops; + inode->i_fop = &tux_file_fops; + inode->i_mapping->a_ops = &tux_aops; + break; + case S_IFDIR: + inode->i_op = &tux_dir_iops; + inode->i_fop = &tux_dir_fops; + inode->i_mapping->a_ops = &tux_dir_aops; +// mapping_set_gfp_mask(inode->i_mapping, GFP_USER_PAGECACHE); + mapping_set_gfp_mask(inode->i_mapping, GFP_USER); + break; + case S_IFLNK: +// inode->i_op = &tux_symlink_iops; +// inode->i_mapping->a_ops = &tux_aops; + break; + } + + unlock_new_inode(inode); + return inode; +} +#endif /* !__KERNEL__ */ diff --git a/fs/tux3/super.c b/fs/tux3/super.c new file mode 100644 index 0000000..2e5122e --- /dev/null +++ b/fs/tux3/super.c @@ -0,0 +1,268 @@ +/* + * tux3/super.c + * Copyright (c) 2008, Daniel Phillips + * Portions copyright (c) 2008, Maciej Zenczykowski + */ + +#include +#include +#include +#include +#include +#include + +#include "tux3.h" + +static struct kmem_cache *tux_inode_cachep; + +static void tux3_inode_init_once(struct kmem_cache *cachep, void *foo) +{ + struct tux_inode *tuxi = (struct tux_inode *)foo; + inode_init_once(&tuxi->vfs_inode); +} + +static int __init tux3_init_inodecache(void) +{ + tux_inode_cachep = kmem_cache_create("tux_inode_cache", + sizeof(struct tux_inode), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + tux3_inode_init_once); + if (tux_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void __exit tux3_destroy_inodecache(void) +{ + kmem_cache_destroy(tux_inode_cachep); +} + +static struct inode *tux3_alloc_inode(struct super_block *sb) +{ + struct tux_inode *tuxi; + tuxi = (struct tux_inode *)kmem_cache_alloc(tux_inode_cachep, GFP_KERNEL); + if (!tuxi) + return NULL; + return &tuxi->vfs_inode; +} + +static void tux3_destroy_inode(struct inode *inode) +{ + kmem_cache_free(tux_inode_cachep, tux_inode(inode)); +} + +static int tux_load_sb(struct super_block *sb, int silent) +{ + struct sb *sbi = tux_sb(sb); + struct buffer_head *bh = NULL; + int err; + + err = -EIO; + bh = sb_bread(sb, SB_LOC >> sb->s_blocksize_bits); + if (!bh) { + if (!silent) + printk(KERN_ERR "TUX3: unable to read superblock\n"); + goto error; + } + + err = -EINVAL; + struct disksuper *disk = (struct disksuper *)bh->b_data; + if (memcmp(disk->magic, (char[])SB_MAGIC, sizeof(disk->magic))) { + if (!silent) + printk(KERN_ERR "TUX3: invalid superblock [%Lx]", + (L)from_be_u64(*(be_u64 *)disk->magic)); + goto error; + } + + int blockbits = from_be_u16(disk->blockbits); + u64 iroot = from_be_u64(disk->iroot); + + sbi->itable = (struct btree){ + .sb = sbi, + .ops = &itable_ops, + .root = (struct root){ + .depth = iroot >> 48, + .block = iroot & (-1ULL >> 16) + }, + .entries_per_leaf = 1 << (blockbits - 6), + }; +// sbi->rootbuf; + sbi->blockbits = blockbits; + sbi->blocksize = 1 << blockbits; + sbi->blockmask = (1 << blockbits) - 1; + sbi->volblocks = from_be_u64(disk->volblocks); + sbi->freeblocks = from_be_u64(disk->freeblocks); + sbi->nextalloc = from_be_u64(disk->nextalloc); + sbi->atomgen = from_be_u32(disk->atomgen); + sbi->freeatom = from_be_u32(disk->freeatom); + + brelse(bh); + + return 0; + +error: + brelse(bh); + return err; +} + +static void tux3_put_super(struct super_block *sb) +{ + struct sb *sbi = tux_sb(sb); + + iput(sbi->atable); + iput(sbi->bitmap); + + sb->s_fs_info = NULL; + kfree(sbi); +} + +static int tux3_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct sb *sbi = tux_sb(sb); + + buf->f_type = sb->s_magic; + buf->f_bsize = sbi->blocksize; + buf->f_blocks = sbi->volblocks; + buf->f_bfree = sbi->freeblocks; + buf->f_bavail = sbi->freeblocks; +#if 0 + buf->f_files = buf->f_blocks << (sbi->clus_bits - EXFAT_CHUNK_BITS) / 3; + buf->f_ffree = buf->f_blocks << (sbi->clus_bits - EXFAT_CHUNK_BITS) / 3; + buf->f_fsid.val[0] = sbi->serial_number; + /*buf->f_fsid.val[1];*/ +#endif + buf->f_namelen = TUX_NAME_LEN; +// buf->f_frsize = sbi->blocksize; + + return 0; +} + +static const struct super_operations tux3_super_ops = { + .alloc_inode = tux3_alloc_inode, + .destroy_inode = tux3_destroy_inode, + .put_super = tux3_put_super, + .statfs = tux3_statfs, + .clear_inode = tux3_clear_inode, + .write_inode = tux3_write_inode, +}; + +static int tux3_fill_super(struct super_block *sb, void *data, int silent) +{ + struct sb *sbi; + int err, blocksize; + + sbi = kzalloc(sizeof(struct sb), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + sb->s_fs_info = sbi; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_magic = 0x54555833; + sb->s_op = &tux3_super_ops; + sb->s_time_gran = 1; + + sbi->vfs_sb = sb; + sbi->entries_per_node = 20; +// sbi->max_inodes_per_block = 64; +// sbi->version; +// sbi->atomref_base; +// sbi->unatom_base; + + err = -EIO; + blocksize = sb_min_blocksize(sb, BLOCK_SIZE); + if (!blocksize) { + if (!silent) + printk(KERN_ERR "TUX3: unable to set blocksize\n"); + goto error; + } + + err = tux_load_sb(sb, silent); + if (err) + goto error; + printk("%s: sb %p, ops %p, depth %Lu, block %Lu, entries_per_leaf %d\n", + __func__, + sbi->itable.sb, sbi->itable.ops, + (L)sbi->itable.root.depth, (L)sbi->itable.root.block, + sbi->itable.entries_per_leaf); + printk("%s: blocksize %u, blockbits %u, blockmask %08x\n", + __func__, sbi->blocksize, sbi->blockbits, sbi->blockmask); + printk("%s: volblocks %Lu, freeblocks %Lu, nextalloc %Lu\n", + __func__, sbi->volblocks, sbi->freeblocks, sbi->nextalloc); + printk("%s: freeatom %u, atomgen %u\n", + __func__, sbi->freeatom, sbi->atomgen); + + if (sbi->blocksize != blocksize) { + if (!sb_set_blocksize(sb, sbi->blocksize)) { + printk(KERN_ERR "TUX3: blocksize too small for device.\n"); + goto error; + } + } + printk("%s: s_blocksize %lu\n", __func__, sb->s_blocksize); + +// struct inode *vtable; + sbi->bitmap = tux3_iget(sb, TUX_BITMAP_INO); + err = PTR_ERR(sbi->bitmap); + if (IS_ERR(sbi->bitmap)) + goto error; + + sbi->rootdir = tux3_iget(sb, TUX_ROOTDIR_INO); + err = PTR_ERR(sbi->rootdir); + if (IS_ERR(sbi->rootdir)) + goto error_bitmap; + + sbi->atable = tux3_iget(sb, TUX_ATABLE_INO); + err = PTR_ERR(sbi->atable); + if (IS_ERR(sbi->atable)) + goto error_rootdir; + + err = -ENOMEM; + sb->s_root = d_alloc_root(sbi->rootdir); + if (!sb->s_root) + goto error_atable; + + return 0; + +error_atable: + iput(sbi->atable); +error_rootdir: + iput(sbi->rootdir); +error_bitmap: + iput(sbi->bitmap); +error: + kfree(sbi); + return err; +} + +static int tux3_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) +{ + return get_sb_bdev(fs_type, flags, dev_name, data, tux3_fill_super, mnt); +} + +static struct file_system_type tux3_fs_type = { + .owner = THIS_MODULE, + .name = "tux3", + .fs_flags = FS_REQUIRES_DEV, + .get_sb = tux3_get_sb, + .kill_sb = kill_block_super, +}; + +static int __init init_tux3(void) +{ + int err = tux3_init_inodecache(); + if (err) + return err; + return register_filesystem(&tux3_fs_type); +} + +static void __exit exit_tux3(void) +{ + unregister_filesystem(&tux3_fs_type); + tux3_destroy_inodecache(); +} + +module_init(init_tux3) +module_exit(exit_tux3) +MODULE_LICENSE("GPL"); diff --git a/fs/tux3/trace.h b/fs/tux3/trace.h new file mode 100644 index 0000000..34bf551 --- /dev/null +++ b/fs/tux3/trace.h @@ -0,0 +1,23 @@ +#ifndef TRACE_H +#define TRACE_H + +#ifdef __KERNEL__ +#define printf printk +#define vprintf vprintk + +#define die(code) BUG_ON(1) +#endif + +#define logline(caller, fmt, args...) do { \ + printf("%s: ", caller); \ + printf(fmt , ##args); \ + printf("\n"); \ +} while (0) + +#define error(string, args...) ({ warn(string "!", ##args); die(99); 1; }) +#define assert(expr) do { if (!(expr)) error("Failed assertion \"%s\"", #expr); } while (0) +#define warn(string, args...) do { logline(__func__, string, ##args); } while (0) +#define trace_off(...) do {} while (0) +#define trace_on(fmt, args...) warn(fmt, ## args) + +#endif diff --git a/fs/tux3/tux3.h b/fs/tux3/tux3.h new file mode 100644 index 0000000..07aa927 --- /dev/null +++ b/fs/tux3/tux3.h @@ -0,0 +1,660 @@ +#ifndef TUX3_H +#define TUX3_H + +#ifdef __KERNEL__ +#include +#include +#include +#include + +typedef loff_t block_t; + +#include "trace.h" +#endif + +typedef long long L; // widen for printf on 64 bit systems + +#define PACKED __attribute__ ((packed)) +#define fieldtype(compound, field) typeof(((compound *)NULL)->field) +#define vecset(d, v, n) memset((d), (v), (n) * sizeof(*(d))) +#define veccopy(d, s, n) memcpy((d), (s), (n) * sizeof(*(d))) +#define vecmove(d, s, n) memmove((d), (s), (n) * sizeof(*(d))) + +typedef u64 fixed32; /* Tux3 time values */ +typedef u32 millisecond_t; +typedef u64 inum_t; +typedef u64 tuxkey_t; + +/* Endian support */ + +typedef u16 __bitwise be_u16; +typedef u32 __bitwise be_u32; +typedef u64 __bitwise be_u64; + +#ifdef __KERNEL__ +static inline u16 from_be_u16(be_u16 val) +{ + return __be16_to_cpu(val); +} + +static inline u32 from_be_u32(be_u32 val) +{ + return __be32_to_cpu(val); +} + +static inline u64 from_be_u64(be_u64 val) +{ + return __be64_to_cpu(val); +} + +static inline be_u16 to_be_u16(u16 val) +{ + return __cpu_to_be16(val); +} + +static inline be_u32 to_be_u32(u32 val) +{ + return __cpu_to_be32(val); +} + +static inline be_u64 to_be_u64(u64 val) +{ + return __cpu_to_be64(val); +} +#else +static inline u16 from_be_u16(be_u16 val) +{ + return bswap_16((__force u16)val); +} + +static inline u32 from_be_u32(be_u32 val) +{ + return bswap_32((__force u32)val); +} + +static inline u64 from_be_u64(be_u64 val) +{ + return bswap_64((__force u64)val); +} + +static inline be_u16 to_be_u16(u16 val) +{ + return (__force be_u16)bswap_16(val); +} + +static inline be_u32 to_be_u32(u32 val) +{ + return (__force be_u32)bswap_32(val); +} + +static inline be_u64 to_be_u64(u64 val) +{ + return (__force be_u64)bswap_64(val); +} +#endif + +static inline void *encode16(void *at, unsigned val) +{ + *(be_u16 *)at = to_be_u16(val); + return at + sizeof(u16); +} + +static inline void *encode32(void *at, unsigned val) +{ + *(be_u32 *)at = to_be_u32(val); + return at + sizeof(u32); +} + +static inline void *encode64(void *at, u64 val) +{ + *(be_u64 *)at = to_be_u64(val); + return at + sizeof(u64); +} + +static inline void *encode48(void *at, u64 val) +{ + at = encode16(at, val >> 32); + return encode32(at, val); +} + +static inline void *decode16(void *at, unsigned *val) +{ + *val = from_be_u16(*(be_u16 *)at); + return at + sizeof(u16); +} + +static inline void *decode32(void *at, unsigned *val) +{ + *val = from_be_u32(*(be_u32 *)at); + return at + sizeof(u32); +} + +static inline void *decode64(void *at, u64 *val) +{ + *val = from_be_u64(*(be_u64 *)at); + return at + sizeof(u64); +} + +static inline void *decode48(void *at, u64 *val) +{ + unsigned part1, part2; + at = decode16(at, &part1); + at = decode32(at, &part2); + *val = (u64)part1 << 32 | part2; + return at; +} + +/* Tux3 disk format */ +#define SB_MAGIC_SIZE 8 +#define SB_MAGIC { 't', 'u', 'x', '3', 0xdd, 0x08, 0x09, 0x06 } /* date of latest incompatible sb format */ +/* + * disk format revision history + * !!! always update this for every incompatible change !!! + * + * 2008-08-06: Beginning of time + * 2008-09-06: Actual checking starts + */ + +#define MAX_INODES_BITS 48 +#define MAX_BLOCKS_BITS 48 +#define MAX_FILESIZE_BITS 60 +#define MAX_FILESIZE (1LL << MAX_FILESIZE_BITS) +#define MAX_EXTENT (1 << 6) +#define SB_LOC (1 << 12) +#define SB struct sb *sb + +/* Special inode numbers */ +#define TUX_BITMAP_INO 0 +#define TUX_VTABLE_INO 2 +#define TUX_ATABLE_INO 10 +#define TUX_ROOTDIR_INO 13 + +struct disksuper +{ + char magic[SB_MAGIC_SIZE]; + be_u64 birthdate; + be_u64 flags; + be_u64 iroot; + be_u64 aroot; + be_u16 blockbits; + be_u16 unused1; + be_u32 unused2; + be_u64 volblocks, freeblocks, nextalloc; + be_u32 freeatom, atomgen; +}; + +struct root { u64 depth:16, block:48; }; + +struct btree { + struct sb *sb; + struct btree_ops *ops; + struct root root; + u16 entries_per_leaf; +}; + +struct tux_path { struct buffer_head *buffer; struct index_entry *next; }; + +struct sb { + struct disksuper super; + + struct btree itable; + struct buffer_head *rootbuf; + struct inode *bitmap, *rootdir, *vtable, *atable; + unsigned blocksize, blockbits, blockmask; + block_t volblocks, freeblocks, nextalloc; + unsigned entries_per_node, max_inodes_per_block; + unsigned version, atomref_base, unatom_base; + unsigned freeatom, atomgen; +#ifdef __KERNEL__ + struct super_block *vfs_sb; +#else + map_t *devmap; +#endif +}; + +#ifdef __KERNEL__ +struct tux_inode { + struct btree btree; + inum_t inum; + unsigned present; + struct xcache *xcache; + + struct inode vfs_inode; +}; + +static inline struct sb *tux_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct super_block *vfs_sb(struct sb *sb) +{ + return sb->vfs_sb; +} + +static inline struct tux_inode *tux_inode(struct inode *inode) +{ + return container_of(inode, struct tux_inode, vfs_inode); +} + +typedef struct address_space map_t; + +static inline map_t *mapping(struct inode *inode) +{ + return inode->i_mapping; +} + +static inline void *malloc(size_t size) +{ + might_sleep(); + return kmalloc(size, GFP_NOFS); +} + +static inline void free(void *ptr) +{ + kfree(ptr); +} +#else +struct inode { + struct btree btree; + inum_t inum; + unsigned present; + struct xcache *xcache; + + struct sb *i_sb; + map_t *map; + u64 i_size; + unsigned i_version; + struct timespec i_mtime, i_ctime, i_atime; + unsigned i_mode, i_uid, i_gid, i_nlink; +}; + +struct file { + struct inode *f_inode; + unsigned f_version; + loff_t f_pos; +}; + +static inline struct sb *tux_sb(struct sb *sb) +{ + return sb; +} + +static inline struct sb *vfs_sb(struct sb *sb) +{ + return sb; +} + +static inline struct inode *tux_inode(struct inode *inode) +{ + return inode; +} + +static inline map_t *mapping(struct inode *inode) +{ + return inode->map; +} +#endif /* !__KERNEL__ */ + +#define TUX_NAME_LEN 255 + +/* directory entry */ +typedef struct { + be_u32 inum; + be_u16 rec_len; + u8 name_len, type; + char name[]; +} tux_dirent; + +/* version:10, count:6, block:48 */ +struct extent { be_u64 block_count_version; }; +/* count:8, keyhi:24 */ +struct group { be_u32 count_and_keyhi; }; +/* limit:8, keylo:24 */ +struct entry { be_u32 limit_and_keylo; }; +struct dleaf { be_u16 magic, groups, free, used; struct extent table[]; }; + +struct dwalk { + struct dleaf *leaf; + struct group *group, *gstop, *gdict; + struct entry *entry, *estop; + struct extent *exbase, *extent, *exstop; + struct { + struct group group; + struct entry entry; + int used, free, groups; + } mock; +}; + +/* group wrappers */ + +static inline struct group make_group(tuxkey_t keyhi, unsigned count) +{ + return (struct group){ to_be_u32(keyhi | (count << 24)) }; +} + +static inline unsigned group_keyhi(struct group *group) +{ + return from_be_u32(*(be_u32 *)group) & 0xffffff; +} + +static inline unsigned group_count(struct group *group) +{ + return *(unsigned char *)group; +} + +static inline void set_group_count(struct group *group, int n) +{ + *(unsigned char *)group = n; +} + +static inline void inc_group_count(struct group *group, int n) +{ + *(unsigned char *)group += n; +} + +/* entry wrappers */ + +static inline struct entry make_entry(tuxkey_t keylo, unsigned limit) +{ + return (struct entry){ to_be_u32(keylo | (limit << 24)) }; +} + +static inline unsigned entry_keylo(struct entry *entry) +{ + return from_be_u32(*(be_u32 *)entry) & ~(-1 << 24); +} + +static inline unsigned entry_limit(struct entry *entry) +{ + return *(unsigned char *)entry; +} + +static inline void inc_entry_limit(struct entry *entry, int n) +{ + *(unsigned char *)entry += n; +} + +/* extent wrappers */ + +static inline struct extent make_extent(block_t block, unsigned count) +{ + assert(block < (1ULL << 48) && count - 1 < (1 << 6)); + return (struct extent){ to_be_u64(((u64)(count - 1) << 48) | block) }; +} + +static inline unsigned extent_block(struct extent extent) +{ + return from_be_u64(*(be_u64 *)&extent) & ~(-1LL << 48); +} + +static inline unsigned extent_count(struct extent extent) +{ + return ((from_be_u64(*(be_u64 *)&extent) >> 48) & 0x3f) + 1; +} + +static inline unsigned extent_version(struct extent extent) +{ + return from_be_u64(*(be_u64 *)&extent) >> 54; +} + +/* dleaf wrappers */ + +static inline unsigned dleaf_groups(struct dleaf *leaf) +{ + return from_be_u16(leaf->groups); +} + +static inline void set_dleaf_groups(struct dleaf *leaf, int n) +{ + leaf->groups = to_be_u16(n); +} + +static inline void inc_dleaf_groups(struct dleaf *leaf, int n) +{ + leaf->groups = to_be_u16(from_be_u16(leaf->groups) + n); +} + +typedef void vleaf; + +#define BTREE struct btree *btree + +struct btree_ops { + int (*leaf_sniff)(BTREE, vleaf *leaf); + int (*leaf_init)(BTREE, vleaf *leaf); + tuxkey_t (*leaf_split)(BTREE, tuxkey_t key, vleaf *from, vleaf *into); + void *(*leaf_resize)(BTREE, tuxkey_t key, vleaf *leaf, unsigned size); + void (*leaf_dump)(BTREE, vleaf *leaf); + unsigned (*leaf_need)(BTREE, vleaf *leaf); + unsigned (*leaf_free)(BTREE, vleaf *leaf); + int (*leaf_chop)(BTREE, tuxkey_t key, vleaf *leaf); + void (*leaf_merge)(BTREE, vleaf *into, vleaf *from); + block_t (*balloc)(SB); + void (*bfree)(SB, block_t block); +}; + +/* + * Tux3 times are 32.32 fixed point while time attributes are stored in 32.16 + * format, trading away some precision to compress time fields by two bytes + * each. It is not clear whether the saved space is worth the lower precision. + */ +#define TIME_ATTR_SHIFT 16 + +static inline u32 high32(fixed32 val) +{ + return val >> 32; +} + +static inline unsigned billionths(fixed32 val) +{ + return (((val & 0xffffffff) * 1000000000) + 0x80000000) >> 32; +} + +static inline struct timespec spectime(fixed32 time) +{ + return (struct timespec){ .tv_sec = high32(time), .tv_nsec = billionths(time) }; +} + +static inline fixed32 tuxtime(struct timespec time) +{ + return ((u64)time.tv_sec << 32) + ((u64)time.tv_nsec << 32) / 1000000000; +} + +static inline struct timespec gettime(void) +{ +#ifdef __KERNEL__ + return current_kernel_time(); +#else + struct timeval now; + gettimeofday(&now, NULL); + return (struct timespec){ .tv_sec = now.tv_sec, .tv_nsec = now.tv_usec * 1000 }; +#endif +} + +struct tux_iattr { + u64 isize; + struct timespec mtime, ctime, atime; + unsigned mode, uid, gid, links; +}; + +void hexdump(void *data, unsigned size); +block_t balloc(SB); +void bfree(SB, block_t block); + +enum atkind { + MIN_ATTR = 6, + MODE_OWNER_ATTR = 6, + DATA_BTREE_ATTR = 7, + CTIME_SIZE_ATTR = 8, + LINK_COUNT_ATTR = 9, + MTIME_ATTR = 10, + IDATA_ATTR = 11, + XATTR_ATTR = 12, + MAX_ATTRS, + VAR_ATTRS = IDATA_ATTR +}; + +enum atbit { + MODE_OWNER_BIT = 1 << MODE_OWNER_ATTR, + CTIME_SIZE_BIT = 1 << CTIME_SIZE_ATTR, + DATA_BTREE_BIT = 1 << DATA_BTREE_ATTR, + LINK_COUNT_BIT = 1 << LINK_COUNT_ATTR, + MTIME_BIT = 1 << MTIME_ATTR, + IDATA_BIT = 1 << IDATA_ATTR, + XATTR_BIT = 1 << XATTR_ATTR, +}; + +extern unsigned atsize[MAX_ATTRS]; + +struct xattr { u16 atom, size; char body[]; }; +struct xcache { u16 size, maxsize; struct xattr xattrs[]; }; + +static inline struct xattr *xcache_next(struct xattr *xattr) +{ + return (void *)xattr->body + xattr->size; +} + +static inline struct xattr *xcache_limit(struct xcache *xcache) +{ + return (void *)xcache + xcache->size; +} + +static inline void *encode_kind(void *attrs, unsigned kind, unsigned version) +{ + return encode16(attrs, (kind << 12) | version); +} + +struct ileaf; +static inline struct ileaf *to_ileaf(vleaf *leaf) +{ + return leaf; +} + +/* for tree_chop */ +struct delete_info { + tuxkey_t key; + block_t blocks, freed; + block_t resume; + int create; +}; + +#ifdef __KERNEL__ +static inline void *bufdata(struct buffer_head *buffer) +{ + return buffer->b_data; +} + +static inline size_t bufsize(struct buffer_head *buffer) +{ + return buffer->b_size; +} + +static inline block_t bufindex(struct buffer_head *buffer) +{ + return buffer->b_blocknr; +} + +static inline int bufcount(struct buffer_head *buffer) +{ + return atomic_read(&buffer->b_count); +} + +#define bufmap(map) NULL // just ignore this until we have peekblk + +static inline struct inode *buffer_inode(struct buffer_head *buffer) +{ + return buffer->b_page->mapping->host; +} + +/* balloc.c */ +block_t balloc_extent(SB, unsigned blocks); + +/* btree.c */ +void release_path(struct tux_path path[], int levels); +struct tux_path *alloc_path(int); +void free_path(struct tux_path path[]); +int probe(BTREE, tuxkey_t key, struct tux_path path[]); +int advance(BTREE, struct tux_path path[]); +tuxkey_t next_key(struct tux_path path[], int levels); +void show_tree_range(BTREE, tuxkey_t start, unsigned count); +int tree_chop(BTREE, struct delete_info *info, millisecond_t deadline); +int btree_leaf_split(struct btree *btree, struct tux_path path[], tuxkey_t key); +void *tree_expand(struct btree *btree, tuxkey_t key, unsigned newsize, struct tux_path path[]); +struct btree new_btree(SB, struct btree_ops *ops); + +/* dir.c */ +loff_t tux_create_entry(struct inode *dir, const char *name, int len, unsigned inum, unsigned mode); +tux_dirent *tux_find_entry(struct inode *dir, const char *name, int len, struct buffer_head **result); +int tux_delete_entry(struct buffer_head *buffer, tux_dirent *entry); +extern const struct file_operations tux_dir_fops; +extern const struct inode_operations tux_dir_iops; + +/* dtree.c */ +unsigned dleaf_free(BTREE, vleaf *leaf); +void dleaf_dump(BTREE, vleaf *vleaf); +int dwalk_probe(struct dleaf *leaf, unsigned blocksize, struct dwalk *walk, tuxkey_t key); +tuxkey_t dwalk_index(struct dwalk *walk); +struct extent *dwalk_next(struct dwalk *walk); +void dwalk_back(struct dwalk *walk); +void dwalk_chop_after(struct dwalk *walk); +int dwalk_mock(struct dwalk *walk, tuxkey_t index, struct extent extent); +int dwalk_pack(struct dwalk *walk, tuxkey_t index, struct extent extent); +extern struct btree_ops dtree_ops; + +/* filemap.c */ +extern const struct address_space_operations tux_aops; +extern const struct address_space_operations tux_dir_aops; + +/* iattr.c */ +unsigned encode_asize(unsigned bits); +void dump_attrs(struct inode *inode); +void *encode_attrs(struct inode *inode, void *attrs, unsigned size); +void *decode_attrs(struct inode *inode, void *attrs, unsigned size); + +/* ileaf.c */ +void *ileaf_lookup(BTREE, inum_t inum, struct ileaf *leaf, unsigned *result); +inum_t find_empty_inode(BTREE, struct ileaf *leaf, inum_t goal); +int ileaf_purge(BTREE, inum_t inum, struct ileaf *leaf); +extern struct btree_ops itable_ops; + +/* inode.c */ +void tux3_clear_inode(struct inode *inode); +int tux3_write_inode(struct inode *inode, int do_sync); +struct inode *tux3_iget(struct super_block *sb, inum_t inum); + +/* xattr.c */ +int xcache_dump(struct inode *inode); +struct xcache *new_xcache(unsigned maxsize); +struct xattr *get_xattr(struct inode *inode, char *name, unsigned len); +int set_xattr(struct inode *inode, char *name, unsigned len, void *data, unsigned size); +void *encode_xattrs(struct inode *inode, void *attrs, unsigned size); +unsigned decode_xsize(struct inode *inode, void *attrs, unsigned size); +unsigned encode_xsize(struct inode *inode); + +/* temporary hack for buffer */ +struct buffer_head *blockread(struct address_space *mapping, block_t iblock); +struct buffer_head *blockget(struct address_space *mapping, block_t iblock); + +static inline int buffer_empty(struct buffer_head *buffer) +{ + return 1; +} + +static inline struct buffer_head *set_buffer_empty(struct buffer_head *buffer) +{ + return buffer; +} + +static inline void brelse_dirty(struct buffer_head *buffer) +{ + mark_buffer_dirty(buffer); + brelse(buffer); +} +#else /* !__KERNEL__ */ +static inline struct inode *buffer_inode(struct buffer_head *buffer) +{ + return buffer->map->inode; +} +#endif /* !__KERNEL__ */ + +#endif diff --git a/fs/tux3/xattr.c b/fs/tux3/xattr.c new file mode 100644 index 0000000..53314e9 --- /dev/null +++ b/fs/tux3/xattr.c @@ -0,0 +1,414 @@ +/* + * Tux3 versioning filesystem in user space + * + * Original copyright (c) 2008 Daniel Phillips + * Licensed under the GPL version 3 + * + * By contributing changes to this file you grant the original copyright holder + * the right to distribute those changes under any license. + */ + +#include "tux3.h" + +#ifndef trace +#define trace trace_on +#endif + +/* Xattr Atoms */ + +/* + * Atom count table: + * + * * Both tables are mapped into the atom table at a high logical offset. + * Allowing 32 bits worth of atom numbers, and with at most 256 atom entries + * per 4K dirent block, we need about (32 << 8) = 1 TB dirent bytes for the + * atom dictionary, so the refcount tables start at block 2^40 >> 12 = 2^28. + * + * * The refcount table consists of pairs of blocks: even blocks with the low + * 16 bits of refcount and odd blocks with the high 16 bits. For 2^32 atoms + * that is 2^34 bytes at most, or 2^22 4K blocks. + * + * Atom reverse map: + * + * * When a new atom dirent is created we also set the reverse map for the + * dirent's atom number to the file offset at which the dirent was created. + * This will be 64 bits just to be lazy so that is 2^32 atoms * 8 bytes + * = 2^35 revmap bytes = 2^23 4K blocks. This starts just above the count + * table, which puts it at logical offset 2^28 + 2^23, leaving a gap after + * the count table in case we decide 32 bits of ref count is not enough. + */ + +typedef u32 atom_t; + +static inline atom_t entry_atom(tux_dirent *entry) +{ + return from_be_u32(entry->inum); +} + +struct buffer_head *blockread_unatom(struct inode *atable, atom_t atom, unsigned *offset) +{ + unsigned shift = tux_sb(atable->i_sb)->blockbits - 3; + *offset = atom & ~(-1 << shift); + return blockread(mapping(atable), tux_sb(atable->i_sb)->unatom_base + (atom >> shift)); +} + +void dump_atoms(struct inode *atable) +{ + SB = tux_sb(atable->i_sb); + unsigned blocks = (sb->atomgen + (sb->blockmask >> 1)) >> (sb->blockbits - 1); + for (unsigned j = 0; j < blocks; j++) { + unsigned block = sb->atomref_base + 2 * j; + struct buffer_head *lobuf, *hibuf; + if (!(lobuf = blockread(mapping(atable), block))) + goto eek; + if (!(hibuf = blockread(mapping(atable), block))) + goto eek; + be_u16 *lorefs = bufdata(lobuf), *hirefs = bufdata(hibuf); + for (unsigned i = 0; i < (sb->blocksize >> 1); i++) { + unsigned refs = (from_be_u16(hirefs[i]) << 16) + from_be_u16(lorefs[i]); + if (!refs) + continue; + atom_t atom = i; + unsigned offset; + struct buffer_head *buffer = blockread_unatom(atable, atom, &offset); + if (!buffer) + goto eek; + u64 where = from_be_u64(((be_u64 *)bufdata(buffer))[offset]); + brelse(buffer); + buffer = blockread(mapping(atable), where >> sb->blockbits); + if (!buffer) + goto eek; + tux_dirent *entry = bufdata(buffer) + (where & sb->blockmask); + if (entry_atom(entry) != atom) { + warn("atom %x reverse entry broken", atom); + continue; + } + printf("{%.*s} = %x\n", entry->name_len, entry->name, atom); + brelse(buffer); + } + brelse(lobuf); + brelse(hibuf); + } + return; +eek: + warn("something broke"); + return; +} + +void show_freeatoms(SB) +{ + struct inode *atable = sb->atable; + atom_t atom = sb->freeatom; + while (atom) { + warn("free atom: %Lx", (L)atom); + unsigned offset; + struct buffer_head *buffer = blockread_unatom(atable, atom, &offset); + if (!buffer) + goto eek; + u64 next = from_be_u64(((be_u64 *)bufdata(buffer))[offset]); + if ((next >> 48) != 0xdead) + goto eek; + atom = next & ~(-1LL << 48); + brelse(buffer); + } + return; +eek: + warn("eek"); +} + +atom_t get_freeatom(struct inode *atable) +{ + SB = tux_sb(atable->i_sb); + atom_t atom = sb->freeatom; + if (!atom) + return sb->atomgen++; + unsigned offset; + struct buffer_head *buffer = blockread_unatom(atable, atom, &offset); + if (!buffer) + goto eek; + u64 next = from_be_u64(((be_u64 *)bufdata(buffer))[offset]); + brelse(buffer); + if ((next >> 48) != 0xdead) + goto eek; + sb->freeatom = next & ~(-1LL << 48); + return atom; +eek: + warn("something horrible happened"); + return -1; +} + +int use_atom(struct inode *atable, atom_t atom, int use) +{ + SB = tux_sb(atable->i_sb); + unsigned shift = sb->blockbits - 1; + unsigned block = sb->atomref_base + 2 * (atom >> shift); + unsigned offset = atom & ~(-1 << shift), kill = 0; + struct buffer_head *buffer; + if (!(buffer = blockread(mapping(atable), block))) + return -EIO; + int low = from_be_u16(((be_u16 *)bufdata(buffer))[offset]) + use; + trace("inc atom %x by %i, offset %x[%x], low = %i", atom, use, block, offset, low); + ((be_u16 *)bufdata(buffer))[offset] = to_be_u16(low); + if (!low || (low & (-1 << 16))) { + brelse_dirty(buffer); + if (!(buffer = blockread(mapping(atable), block + 1))) + return -EIO; + int high = from_be_u16(((be_u16 *)bufdata(buffer))[offset]) + (low >> 16); + trace("carry %i, offset %x[%x], high = %i", low >> 16, block, offset, high); + ((be_u16 *)bufdata(buffer))[offset] = to_be_u16(high); + kill = !(low | high); + } + brelse_dirty(buffer); + if (kill) { + warn("delete atom %Lx", (L) atom); + buffer = blockread_unatom(atable, atom, &offset); + if (!buffer) + return -1; // better set a flag that unatom broke or something!!! + u64 where = from_be_u64(((be_u64 *)bufdata(buffer))[offset]); + brelse(buffer); + ((be_u64 *)bufdata(buffer))[offset] = to_be_u64((u64)sb->freeatom | (0xdeadLL << 48)); + sb->freeatom = atom; + buffer = blockread(mapping(atable), where >> sb->blockbits); + tux_dirent *entry = bufdata(buffer) + (where & sb->blockmask); + if (entry_atom(entry) == atom) + tux_delete_entry(buffer, entry); + else { + warn("atom entry not found"); + brelse(buffer); + } + } + return 0; +} + +atom_t find_atom(struct inode *atable, char *name, unsigned len) +{ + struct buffer_head *buffer; + tux_dirent *entry = tux_find_entry(atable, name, len, &buffer); + if (!entry) + return -1; + atom_t atom = entry_atom(entry); + brelse(buffer); + return atom; +} + +atom_t make_atom(struct inode *atable, char *name, unsigned len) +{ + atom_t atom = find_atom(atable, name, len); + if (atom != -1) + return atom; + atom = get_freeatom(atable); + loff_t where = tux_create_entry(atable, name, len, atom, 0); + if (where < 0) + return -1; // and what about the err??? + + /* Enter into reverse map - maybe verify zero refs? */ + unsigned offset; + struct buffer_head *buffer = blockread_unatom(atable, atom, &offset); + if (!buffer) + return -1; // better set a flag that unatom broke or something!!! + ((be_u64 *)bufdata(buffer))[offset] = to_be_u64(where); + brelse_dirty(buffer); + + return atom; +} + +/* Xattr cache */ + +int xcache_dump(struct inode *inode) +{ + if (!tux_inode(inode)->xcache) + return 0; + //warn("xattrs %p/%i", inode->xcache, inode->xcache->size); + struct xattr *xattr = tux_inode(inode)->xcache->xattrs; + struct xattr *limit = xcache_limit(tux_inode(inode)->xcache); + while (xattr < limit) { + if (!xattr->size) + goto zero; + if (xattr->size > tux_sb(inode->i_sb)->blocksize) + goto barf; + printf("atom %.3x => ", xattr->atom); + hexdump(xattr->body, xattr->size); + struct xattr *xnext = xcache_next(xattr); + if (xnext > limit) + goto over; + xattr = xnext; + } + assert(xattr == limit); + return 0; +zero: + error("zero length xattr"); +over: + error("corrupt xattrs"); +barf: + error("xattr too big"); + return -1; +} + +struct xattr *xcache_lookup(struct inode *inode, unsigned atom, int *err) +{ + if (!tux_inode(inode)->xcache) + return NULL; + struct xattr *xattr = tux_inode(inode)->xcache->xattrs; + struct xattr *limit = xcache_limit(tux_inode(inode)->xcache); + while (xattr < limit) { + if (!xattr->size) + goto zero; + if (xattr->atom == atom) + return xattr; + struct xattr *xnext = xcache_next(xattr); + if (xnext > limit) + goto over; + xattr = xnext; + } + assert(xattr == limit); +null: + return NULL; +zero: + *err = EINVAL; + error("zero length xattr"); + goto null; +over: + *err = EINVAL; + error("corrupt xattrs"); + goto null; +} + +struct xcache *new_xcache(unsigned maxsize) +{ + warn("realloc xcache to %i", maxsize); + struct xcache *xcache = malloc(maxsize); + if (!xcache) + return NULL; + *xcache = (struct xcache){ .size = offsetof(struct xcache, xattrs), .maxsize = maxsize }; + return xcache; +} + +/* + * Things to improve about xcache_update: + * + * * It always allocates the new attribute at the end of the list because it + * is lazy and works by always deleting the attribute first then putting + * the new one at the end + * + * * If the size of the attribute did not change, does unecessary work + * + * * Should expand by binary factor + */ +int xcache_update(struct inode *inode, unsigned atom, void *data, unsigned len) +{ + int err = 0, use = 0; + struct xattr *xattr = xcache_lookup(inode, atom, &err); + if (xattr) { + unsigned size = (void *)xcache_next(xattr) - (void *)xattr; + //warn("size = %i\n", size); + memmove(xattr, xcache_next(xattr), tux_inode(inode)->xcache->size -= size); + use--; + } + if (len) { + unsigned more = sizeof(*xattr) + len; + struct xcache *xcache = tux_inode(inode)->xcache; + if (!xcache || xcache->size + more > xcache->maxsize) { + unsigned oldsize = xcache ? xcache->size : offsetof(struct xcache, xattrs); + unsigned maxsize = xcache ? xcache->maxsize : (1 << 7); + unsigned newsize = oldsize + (more < maxsize ? maxsize : more); + struct xcache *newcache = new_xcache(newsize); + if (!newcache) + return -ENOMEM; + if (xcache) { + memcpy(newcache->xattrs, xcache->xattrs, oldsize - offsetof(struct xcache, xattrs)); + newcache->size = oldsize; + free(xcache); + } + tux_inode(inode)->xcache = newcache; + } + xattr = xcache_limit(tux_inode(inode)->xcache); + //warn("expand by %i\n", more); + tux_inode(inode)->xcache->size += more; + memcpy(xattr->body, data, (xattr->size = len)); + xattr->atom = atom; + use++; + } + if (use) + use_atom(tux_sb(inode->i_sb)->atable, atom, use); + return 0; +} + +struct xattr *get_xattr(struct inode *inode, char *name, unsigned len) +{ + int err = 0; + atom_t atom = find_atom(tux_sb(inode->i_sb)->atable, name, len); + if (atom == -1) + return NULL; + return xcache_lookup(inode, atom, &err); // and what about the err??? +} + +int set_xattr(struct inode *inode, char *name, unsigned len, void *data, unsigned size) +{ + atom_t atom = make_atom(tux_sb(inode->i_sb)->atable, name, len); + if (atom == -1) + return -ENOENT; + return xcache_update(inode, atom, data, size); +} + +/* Xattr encode/decode */ + +void *encode_xattrs(struct inode *inode, void *attrs, unsigned size) +{ + if (!tux_inode(inode)->xcache) + return attrs; + struct xattr *xattr = tux_inode(inode)->xcache->xattrs; + struct xattr *xtop = xcache_limit(tux_inode(inode)->xcache); + void *limit = attrs + size - 3; + while (xattr < xtop) { + if (attrs >= limit) + break; + //immediate xattr: kind+version:16, bytes:16, atom:16, data[bytes - 2] + //printf("xattr %x/%x ", xattr->atom, xattr->size); + attrs = encode_kind(attrs, XATTR_ATTR, tux_sb(inode->i_sb)->version); + attrs = encode16(attrs, xattr->size + 2); + attrs = encode16(attrs, xattr->atom); + memcpy(attrs, xattr->body, xattr->size); + attrs += xattr->size; + xattr = xcache_next(xattr); + } + return attrs; +} + +unsigned decode_xsize(struct inode *inode, void *attrs, unsigned size) +{ + SB = tux_sb(inode->i_sb); + unsigned total = 0, bytes; + void *limit = attrs + size; + while (attrs < limit - 1) { + unsigned head, kind; + attrs = decode16(attrs, &head); + switch ((kind = head >> 12)) { + case XATTR_ATTR: + case IDATA_ATTR: + // immediate data: kind+version:16, bytes:16, data[bytes] + // immediate xattr: kind+version:16, bytes:16, atom:16, data[bytes - 2] + attrs = decode16(attrs, &bytes); + attrs += bytes; + if ((head & 0xfff) == sb->version) + total += sizeof(struct xattr) + bytes - 2; + continue; + } + attrs += atsize[kind]; + } + return total + sizeof(struct xcache); +} + +unsigned encode_xsize(struct inode *inode) +{ + if (!tux_inode(inode)->xcache) + return 0; + unsigned size = 0, xatsize = atsize[XATTR_ATTR]; + struct xattr *xattr = tux_inode(inode)->xcache->xattrs; + struct xattr *limit = xcache_limit(tux_inode(inode)->xcache); + while (xattr < limit) { + size += 2 + xatsize + xattr->size; + xattr = xcache_next(xattr); + } + assert(xattr == limit); + return size; +} diff --git a/include/linux/tux3.h b/include/linux/tux3.h new file mode 100644 index 0000000..37d1dcb --- /dev/null +++ b/include/linux/tux3.h @@ -0,0 +1,4 @@ +#ifndef LINUX_TUX3_H +#define LINUX_TUX3_H + +#endif