| Kernel v2.5.45 /fs/direct-io.c |
|---|
 2.5.45
 fs
 direct-io.c
diff -Nru a/fs/direct-io.c b/fs/direct-io.c
--- a/fs/direct-io.c Wed Oct 30 16:44:29 2002
+++ b/fs/direct-io.c Wed Oct 30 16:44:29 2002
@@ -29,30 +29,67 @@
*/
#define DIO_PAGES 64
+/*
+ * This code generally works in units of "dio_blocks". A dio_block is
+ * somewhere between the hard sector size and the filesystem block size. It
+ * is determined on a per-invocation basis. When talking to the filesystem
+ * we need to convert dio_blocks to fs_blocks by scaling the dio_block quantity
+ * down by dio->blkfactor. Similarly, fs-blocksize quantities are converted
+ * to dio_block quantities by shifting left by blkfactor.
+ *
+ * If blkfactor is zero then the user's request was aligned to the filesystem's
+ * blocksize.
+ */
+
struct dio {
/* BIO submission state */
struct bio *bio; /* bio under assembly */
struct inode *inode;
int rw;
unsigned blkbits; /* doesn't change */
- sector_t block_in_file; /* changes */
+ unsigned blkfactor; /* When we're using an alignment which
+ is finer than the filesystem's soft
+ blocksize, this specifies how much
+ finer. blkfactor=2 means 1/4-block
+ alignment. Does not change */
+ unsigned start_zero_done; /* flag: sub-blocksize zeroing has
+ been performed at the start of a
+ write */
+ int pages_in_io; /* approximate total IO pages */
+ sector_t block_in_file; /* Current offset into the underlying
+ file in dio_block units. */
unsigned blocks_available; /* At block_in_file. changes */
sector_t final_block_in_request;/* doesn't change */
unsigned first_block_in_page; /* doesn't change, Used only once */
int boundary; /* prev block is at a boundary */
int reap_counter; /* rate limit reaping */
get_blocks_t *get_blocks; /* block mapping function */
- sector_t last_block_in_bio; /* current final block in bio */
- sector_t next_block_in_bio; /* next block to be added to bio */
+ sector_t final_block_in_bio; /* current final block in bio + 1 */
+ sector_t next_block_for_io; /* next block to be put under IO,
+ in dio_blocks units */
struct buffer_head map_bh; /* last get_blocks() result */
- /* Page fetching state */
+ /*
+ * Deferred addition of a page to the dio. These variables are
+ * private to dio_send_cur_page(), submit_page_section() and
+ * dio_bio_add_page().
+ */
+ struct page *cur_page; /* The page */
+ unsigned cur_page_offset; /* Offset into it, in bytes */
+ unsigned cur_page_len; /* Nr of bytes at cur_page_offset */
+ sector_t cur_page_block; /* Where it starts */
+
+ /*
+ * Page fetching state. These variables belong to dio_refill_pages().
+ */
int curr_page; /* changes */
int total_pages; /* doesn't change */
- int pages_left; /* approximate total IO pages */
unsigned long curr_user_address;/* changes */
- /* Page queue */
+ /*
+ * Page queue. These variables belong to dio_refill_pages() and
+ * dio_get_page().
+ */
struct page *pages[DIO_PAGES]; /* page buffer */
unsigned head; /* next page to process */
unsigned tail; /* last valid page + 1 */
@@ -318,74 +355,41 @@
*
* In the case of filesystem holes: the fs may return an arbitrarily-large
* hole by returning an appropriate value in b_size and by clearing
- * buffer_mapped(). This code _should_ handle that case correctly, but it has
- * only been tested against single-block holes (b_size == blocksize).
+ * buffer_mapped(). However the direct-io code will only process holes one
+ * block at a time - it will repeatedly call get_blocks() as it walks the hole.
*/
static int get_more_blocks(struct dio *dio)
{
int ret;
struct buffer_head *map_bh = &dio->map_bh;
-
- if (dio->blocks_available)
- return 0;
+ sector_t fs_startblk; /* Into file, in filesystem-sized blocks */
+ unsigned long fs_count; /* Number of filesystem-sized blocks */
+ unsigned long dio_count;/* Number of dio_block-sized blocks */
+ unsigned long blkmask;
/*
* If there was a memory error and we've overwritten all the
* mapped blocks then we can now return that memory error
*/
- if (dio->page_errors) {
- ret = dio->page_errors;
- goto out;
- }
-
- map_bh->b_state = 0;
- map_bh->b_size = 0;
- BUG_ON(dio->block_in_file >= dio->final_block_in_request);
- ret = (*dio->get_blocks)(dio->inode, dio->block_in_file,
- dio->final_block_in_request - dio->block_in_file,
- map_bh, dio->rw == WRITE);
- if (ret)
- goto out;
+ ret = dio->page_errors;
+ if (ret == 0) {
+ map_bh->b_state = 0;
+ map_bh->b_size = 0;
+ BUG_ON(dio->block_in_file >= dio->final_block_in_request);
+ fs_startblk = dio->block_in_file >> dio->blkfactor;
+ dio_count = dio->final_block_in_request - dio->block_in_file;
+ fs_count = dio_count >> dio->blkfactor;
+ blkmask = (1 << dio->blkfactor) - 1;
+ if (dio_count & blkmask)
+ fs_count++;
- if (buffer_mapped(map_bh)) {
- BUG_ON(map_bh->b_size == 0);
- BUG_ON((map_bh->b_size & ((1 << dio->blkbits) - 1)) != 0);
-
- dio->blocks_available = map_bh->b_size >> dio->blkbits;
-
- /* blockdevs do not set buffer_new */
- if (buffer_new(map_bh)) {
- sector_t block = map_bh->b_blocknr;
- unsigned i;
-
- for (i = 0; i < dio->blocks_available; i++)
- unmap_underlying_metadata(map_bh->b_bdev,
- block++);
- }
- } else {
- BUG_ON(dio->rw != READ);
- if (dio->bio)
- dio_bio_submit(dio);
+ ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count,
+ map_bh, dio->rw == WRITE);
}
- dio->next_block_in_bio = map_bh->b_blocknr;
-out:
return ret;
}
/*
- * Check to see if we can continue to grow the BIO. If not, then send it.
- */
-static void dio_prep_bio(struct dio *dio)
-{
- if (dio->bio == NULL)
- return;
-
- if (dio->boundary ||
- dio->last_block_in_bio != dio->next_block_in_bio - 1)
- dio_bio_submit(dio);
-}
-
-/*
* There is no bio. Make one now.
*/
static int dio_new_bio(struct dio *dio, sector_t blkno)
@@ -397,7 +401,7 @@
if (ret)
goto out;
sector = blkno << (dio->blkbits - 9);
- nr_pages = min(dio->pages_left, bio_get_nr_vecs(dio->map_bh.b_bdev));
+ nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
BUG_ON(nr_pages <= 0);
ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
dio->boundary = 0;
@@ -405,37 +409,199 @@
return ret;
}
+/*
+ * Attempt to put the current chunk of 'cur_page' (cur_page_offset,
+ * cur_page_len) into the current BIO.
+ *
+ * On success, pages_in_io is decremented, a ref is taken against the
+ * just-added page on behalf of the BIO, final_block_in_bio is advanced to one
+ * past the last block of the chunk, and 0 is returned.
+ *
+ * On failure nothing is updated and 1 is returned, so that the caller can
+ * submit the BIO and retry against a new one.
+ */
+static int dio_bio_add_page(struct dio *dio)
+{
+	int ret;
-static int
-dio_bio_add_page(struct dio *dio, struct page *page,
- unsigned int bv_len, unsigned int bv_offset, sector_t blkno)
+	ret = bio_add_page(dio->bio, dio->cur_page,
+			dio->cur_page_len, dio->cur_page_offset);
+	if (ret == dio->cur_page_len) {
+		dio->pages_in_io--;
+		page_cache_get(dio->cur_page);
+		dio->final_block_in_bio = dio->cur_page_block +
+			(dio->cur_page_len >> dio->blkbits);
+		ret = 0;
+	} else {
+		/*
+		 * Anything other than the full length is a failure. That
+		 * includes a return of 0, which must not be passed back to
+		 * the caller because the caller treats 0 as success.
+		 */
+		ret = 1;
+	}
+	return ret;
+}
+
+/*
+ * Put cur_page under IO. The section of cur_page which is described by
+ * cur_page_offset,cur_page_len is put into a BIO. The section of cur_page
+ * starts on-disk at cur_page_block.
+ *
+ * We take a ref against the page here (on behalf of its presence in the bio).
+ *
+ * The caller of this function is responsible for removing cur_page from the
+ * dio, and for dropping the refcount which came from that presence.
+ */
+static int dio_send_cur_page(struct dio *dio)
{
int ret = 0;
- if (bv_len == 0)
- goto out;
+ if (dio->bio) {
+ /*
+ * See whether this new request is contiguous with the old
+ */
+ if (dio->final_block_in_bio != dio->cur_page_block)
+ dio_bio_submit(dio);
+ /*
+ * Submit now if the underlying fs is about to perform a
+ * metadata read
+ */
+ if (dio->boundary)
+ dio_bio_submit(dio);
+ }
- /* Take a ref against the page each time it is placed into a BIO */
- page_cache_get(page);
- if (bio_add_page(dio->bio, page, bv_len, bv_offset)) {
+ if (dio->bio == NULL) {
+ ret = dio_new_bio(dio, dio->cur_page_block);
+ if (ret)
+ goto out;
+ }
+
+ if (dio_bio_add_page(dio) != 0) {
dio_bio_submit(dio);
- ret = dio_new_bio(dio, blkno);
+ ret = dio_new_bio(dio, dio->cur_page_block);
if (ret == 0) {
- ret = bio_add_page(dio->bio, page, bv_len, bv_offset);
+ ret = dio_bio_add_page(dio);
BUG_ON(ret != 0);
- } else {
- /* The page didn't make it into a BIO */
- page_cache_release(page);
}
}
- dio->pages_left--;
out:
return ret;
}
+/*
+ * An autonomous function to put a chunk of a page under deferred IO.
+ *
+ * The caller doesn't actually know (or care) whether this piece of page is in
+ * a BIO, or is under IO or whatever. We just take care of all possible
+ * situations here. The separation between the logic of do_direct_IO() and
+ * that of submit_page_section() is important for clarity. Please don't break.
+ *
+ * The chunk of page starts on-disk at blocknr.
+ *
+ * We perform deferred IO, by recording the last-submitted page inside our
+ * private part of the dio structure. If possible, we just expand the IO
+ * across that page here.
+ *
+ * If that doesn't work out then we put the old page into the bio and add this
+ * page to the dio instead.
+ *
+ * @dio:     the direct-IO state; cur_page, cur_page_offset, cur_page_len and
+ *           cur_page_block are read and updated here
+ * @page:    the page holding the chunk
+ * @offset:  byte offset of the chunk within @page
+ * @len:     length of the chunk in bytes
+ * @blocknr: block at which the chunk starts on-disk
+ *
+ * Returns 0 on success, or the nonzero value returned by dio_send_cur_page()
+ * when a previously deferred chunk could not be sent.
+ */
+static int
+submit_page_section(struct dio *dio, struct page *page,
+ unsigned offset, unsigned len, sector_t blocknr)
+{
+ int ret = 0;
+
+ /*
+ * Can we just grow the current page's presence in the dio?
+ * That needs the same page, a chunk which begins exactly where the
+ * deferred chunk ends within the page, and a start block which
+ * immediately follows the deferred chunk's blocks.
+ */
+ if ( (dio->cur_page == page) &&
+ (dio->cur_page_offset + dio->cur_page_len == offset) &&
+ (dio->cur_page_block +
+ (dio->cur_page_len >> dio->blkbits) == blocknr)) {
+ dio->cur_page_len += len;
+
+ /*
+ * If dio->boundary then we want to schedule the IO now to
+ * avoid metadata seeks.
+ */
+ if (dio->boundary) {
+ ret = dio_send_cur_page(dio);
+ /* Drop the dio's ref on the page whatever the outcome */
+ page_cache_release(dio->cur_page);
+ dio->cur_page = NULL;
+ }
+ goto out;
+ }
+
+ /*
+ * If there's a deferred page already there then send it.
+ * The dio's ref on it is dropped even if the send fails; on failure
+ * the new chunk is not recorded.
+ */
+ if (dio->cur_page) {
+ ret = dio_send_cur_page(dio);
+ page_cache_release(dio->cur_page);
+ dio->cur_page = NULL;
+ if (ret)
+ goto out;
+ }
+
+ /* Record the new chunk as the deferred one */
+ page_cache_get(page); /* It is in dio */
+ dio->cur_page = page;
+ dio->cur_page_offset = offset;
+ dio->cur_page_len = len;
+ dio->cur_page_block = blocknr;
+out:
+ return ret;
+}
+
+/*
+ * Walk the blocks_available blocks of the current mapping, starting at
+ * map_bh.b_blocknr, and call unmap_underlying_metadata() on each one so that
+ * dirty buffers in the blockdev mapping which alias newly-created file blocks
+ * are cleaned. Only called for S_ISREG files - blockdevs do not set
+ * buffer_new
+ */
+static void clean_blockdev_aliases(struct dio *dio)
+{
+	struct buffer_head *bh = &dio->map_bh;
+	unsigned nr = 0;
+
+	while (nr < dio->blocks_available) {
+		unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr + nr);
+		nr++;
+	}
+}
+
+/*
+ * If we are not writing the entire block and get_block() allocated
+ * the block for us, we need to fill-in the unused portion of the
+ * block with zeros. This happens only if user-buffer, fileoffset or
+ * io length is not filesystem block-size multiple.
+ *
+ * `end' is zero if we're doing the start of the IO, 1 at the end of the
+ * IO.
+ *
+ * Always sets dio->start_zero_done. Nothing further is done unless
+ * dio->blkfactor is nonzero, the current mapping is buffer_new(), and
+ * dio->block_in_file lies partway into a filesystem block.
+ *
+ * The zeroes are queued through submit_page_section() from the zero page,
+ * starting at dio->next_block_for_io, which is advanced past the zeroed
+ * blocks on success. If submit_page_section() fails, next_block_for_io is
+ * left alone and the error is not reported to the caller.
+ */
+static void dio_zero_block(struct dio *dio, int end)
+{
+	unsigned dio_blocks_per_fs_block;
+	unsigned this_chunk_blocks;	/* In dio_blocks */
+	unsigned this_chunk_bytes;
+	struct page *page;
+
+	dio->start_zero_done = 1;
+	if (!dio->blkfactor || !buffer_new(&dio->map_bh))
+		return;
+
+	/* How many dio_blocks into its fs block is block_in_file? */
+	dio_blocks_per_fs_block = 1 << dio->blkfactor;
+	this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1);
+
+	if (!this_chunk_blocks)
+		return;
+
+	/*
+	 * We need to zero out part of an fs block. It is either at the
+	 * beginning or the end of the fs block.
+	 */
+	if (end)
+		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
+
+	this_chunk_bytes = this_chunk_blocks << dio->blkbits;
+
+	/* The struct dio field is curr_user_address, not cur_user_address */
+	page = ZERO_PAGE(dio->curr_user_address);
+	if (submit_page_section(dio, page, 0, this_chunk_bytes,
+				dio->next_block_for_io))
+		return;
+
+	dio->next_block_for_io += this_chunk_blocks;
+}
/*
- * Walk the user pages, and the file, mapping blocks to disk and emitting BIOs.
+ * Walk the user pages, and the file, mapping blocks to disk and generating
+ * a sequence of (page,offset,len,block) mappings. These mappings are injected
+ * into submit_page_section(), which takes care of the next stage of submission
*
* Direct IO against a blockdev is different from a file. Because we can
* happily perform page-sized but 512-byte aligned IOs. It is important that
@@ -448,73 +614,101 @@
* it should set b_size to PAGE_SIZE or more inside get_blocks(). This gives
* fine alignment but still allows this function to work in PAGE_SIZE units.
*/
-int do_direct_IO(struct dio *dio)
+static int do_direct_IO(struct dio *dio)
{
const unsigned blkbits = dio->blkbits;
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
struct page *page;
unsigned block_in_page;
+ struct buffer_head *map_bh = &dio->map_bh;
int ret = 0;
/* The I/O can start at any block offset within the first page */
block_in_page = dio->first_block_in_page;
while (dio->block_in_file < dio->final_block_in_request) {
- int new_page; /* Need to insert this page into the BIO? */
- unsigned int bv_offset;
- unsigned int bv_len;
- sector_t curr_blkno;
-
page = dio_get_page(dio);
if (IS_ERR(page)) {
ret = PTR_ERR(page);
goto out;
}
- new_page = 1;
- bv_offset = 0;
- bv_len = 0;
- curr_blkno = 0;
while (block_in_page < blocks_per_page) {
+ unsigned offset_in_page = block_in_page << blkbits;
unsigned this_chunk_bytes; /* # of bytes mapped */
unsigned this_chunk_blocks; /* # of blocks */
unsigned u;
- ret = get_more_blocks(dio);
- if (ret)
- goto fail_release;
-
+ if (dio->blocks_available == 0) {
+ /*
+ * Need to go and map some more disk
+ */
+ unsigned long blkmask;
+ unsigned long dio_remainder;
+
+ ret = get_more_blocks(dio);
+ if (ret) {
+ page_cache_release(page);
+ goto out;
+ }
+ if (!buffer_mapped(map_bh))
+ goto do_holes;
+
+ dio->blocks_available =
+ map_bh->b_size >> dio->blkbits;
+ dio->next_block_for_io =
+ map_bh->b_blocknr << dio->blkfactor;
+ if (buffer_new(map_bh))
+ clean_blockdev_aliases(dio);
+
+ if (!dio->blkfactor)
+ goto do_holes;
+
+ blkmask = (1 << dio->blkfactor) - 1;
+ dio_remainder = (dio->block_in_file & blkmask);
+
+ /*
+ * If we are at the start of IO and that IO
+ * starts partway into a fs-block,
+ * dio_remainder will be non-zero. If the IO
+ * is a read then we can simply advance the IO
+ * cursor to the first block which is to be
+ * read. But if the IO is a write and the
+ * block was newly allocated we cannot do that;
+ * the start of the fs block must be zeroed out
+ * on-disk
+ */
+ if (!buffer_new(map_bh))
+ dio->next_block_for_io += dio_remainder;
+ dio->blocks_available -= dio_remainder;
+ }
+do_holes:
/* Handle holes */
- if (!buffer_mapped(&dio->map_bh)) {
+ if (!buffer_mapped(map_bh)) {
char *kaddr = kmap_atomic(page, KM_USER0);
memset(kaddr + (block_in_page << blkbits),
0, 1 << blkbits);
flush_dcache_page(page);
kunmap_atomic(kaddr, KM_USER0);
dio->block_in_file++;
- dio->next_block_in_bio++;
block_in_page++;
goto next_block;
}
- dio_prep_bio(dio);
- if (dio->bio == NULL) {
- ret = dio_new_bio(dio, dio->next_block_in_bio);
- if (ret)
- goto fail_release;
- new_page = 1;
- }
-
- if (new_page) {
- bv_len = 0;
- bv_offset = block_in_page << blkbits;
- curr_blkno = dio->next_block_in_bio;
- new_page = 0;
- }
-
- /* Work out how much disk we can add to this page */
+ /*
+ * If we're performing IO which has an alignment which
+ * is finer than the underlying fs, go check to see if
+ * we must zero out the start of this block.
+ */
+ if (unlikely(dio->blkfactor && !dio->start_zero_done))
+ dio_zero_block(dio, 0);
+
+ /*
+ * Work out, in this_chunk_blocks, how much disk we
+ * can add to this page
+ */
this_chunk_blocks = dio->blocks_available;
- u = (PAGE_SIZE - (bv_len + bv_offset)) >> blkbits;
+ u = (PAGE_SIZE - offset_in_page) >> blkbits;
if (this_chunk_blocks > u)
this_chunk_blocks = u;
u = dio->final_block_in_request - dio->block_in_file;
@@ -523,10 +717,15 @@
this_chunk_bytes = this_chunk_blocks << blkbits;
BUG_ON(this_chunk_bytes == 0);
- bv_len += this_chunk_bytes;
- dio->next_block_in_bio += this_chunk_blocks;
- dio->last_block_in_bio = dio->next_block_in_bio - 1;
- dio->boundary = buffer_boundary(&dio->map_bh);
+ dio->boundary = buffer_boundary(map_bh);
+ ret = submit_page_section(dio, page, offset_in_page,
+ this_chunk_bytes, dio->next_block_for_io);
+ if (ret) {
+ page_cache_release(page);
+ goto out;
+ }
+ dio->next_block_for_io += this_chunk_blocks;
+
dio->block_in_file += this_chunk_blocks;
block_in_page += this_chunk_blocks;
dio->blocks_available -= this_chunk_blocks;
@@ -536,27 +735,20 @@
if (dio->block_in_file == dio->final_block_in_request)
break;
}
- ret = dio_bio_add_page(dio, page, bv_len,
- bv_offset, curr_blkno);
- if (ret)
- goto fail_release;
/* Drop the ref which was taken in get_user_pages() */
page_cache_release(page);
block_in_page = 0;
}
- goto out;
-fail_release:
- page_cache_release(page);
out:
return ret;
}
-int
+static int
direct_io_worker(int rw, struct inode *inode, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks)
+ loff_t offset, unsigned long nr_segs, unsigned blkbits,
+ get_blocks_t get_blocks)
{
- const unsigned blkbits = inode->i_blkbits;
unsigned long user_addr;
int seg, ret2, ret = 0;
struct dio dio;
@@ -566,14 +758,18 @@
dio.inode = inode;
dio.rw = rw;
dio.blkbits = blkbits;
+ dio.blkfactor = inode->i_blkbits - blkbits;
+ dio.start_zero_done = 0;
dio.block_in_file = offset >> blkbits;
dio.blocks_available = 0;
+ dio.cur_page = NULL;
+
dio.boundary = 0;
dio.reap_counter = 0;
dio.get_blocks = get_blocks;
- dio.last_block_in_bio = -1;
- dio.next_block_in_bio = -1;
+ dio.final_block_in_bio = -1;
+ dio.next_block_for_io = -1;
dio.page_errors = 0;
@@ -582,10 +778,10 @@
spin_lock_init(&dio.bio_list_lock);
dio.bio_list = NULL;
dio.waiter = NULL;
- dio.pages_left = 0;
+ dio.pages_in_io = 0;
for (seg = 0; seg < nr_segs; seg++)
- dio.pages_left += (iov[seg].iov_len / PAGE_SIZE) + 2;
+ dio.pages_in_io += (iov[seg].iov_len >> blkbits) + 2;
for (seg = 0; seg < nr_segs; seg++) {
user_addr = (unsigned long)iov[seg].iov_base;
@@ -619,6 +815,18 @@
} /* end iovec loop */
+ /*
+ * There may be some unwritten disk at the end of a part-written
+ * fs-block-sized block. Go zero that now.
+ */
+ dio_zero_block(&dio, 1);
+
+ if (dio.cur_page) {
+ ret2 = dio_send_cur_page(&dio);
+ page_cache_release(dio.cur_page);
+ if (ret == 0)
+ ret = ret2;
+ }
ret2 = dio_await_completion(&dio);
if (ret == 0)
ret = ret2;
@@ -634,27 +842,44 @@
* This is a library function for use by filesystem drivers.
*/
int
-generic_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks)
+generic_direct_IO(int rw, struct inode *inode, struct block_device *bdev,
+ const struct iovec *iov, loff_t offset, unsigned long nr_segs,
+ get_blocks_t get_blocks)
{
int seg;
size_t size;
unsigned long addr;
- unsigned blocksize_mask = (1 << inode->i_blkbits) - 1;
+ unsigned blkbits = inode->i_blkbits;
+ unsigned bdev_blkbits = 0;
+ unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL;
- if (offset & blocksize_mask)
- goto out;
+ if (bdev)
+ bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
+
+ if (offset & blocksize_mask) {
+ if (bdev)
+ blkbits = bdev_blkbits;
+ blocksize_mask = (1 << blkbits) - 1;
+ if (offset & blocksize_mask)
+ goto out;
+ }
/* Check the memory alignment. Blocks cannot straddle pages */
for (seg = 0; seg < nr_segs; seg++) {
addr = (unsigned long)iov[seg].iov_base;
size = iov[seg].iov_len;
- if ((addr & blocksize_mask) || (size & blocksize_mask))
- goto out;
+ if ((addr & blocksize_mask) || (size & blocksize_mask)) {
+ if (bdev)
+ blkbits = bdev_blkbits;
+ blocksize_mask = (1 << blkbits) - 1;
+ if ((addr & blocksize_mask) || (size & blocksize_mask))
+ goto out;
+ }
}
- retval = direct_io_worker(rw, inode, iov, offset, nr_segs, get_blocks);
+ retval = direct_io_worker(rw, inode, iov, offset,
+ nr_segs, blkbits, get_blocks);
out:
return retval;
}
@@ -675,7 +900,7 @@
}
retval = mapping->a_ops->direct_IO(rw, file, iov, offset, nr_segs);
- if (mapping->nrpages)
+ if (rw == WRITE && mapping->nrpages)
invalidate_inode_pages2(mapping);
out:
return retval;
|