| Kernel v2.4.13-ac8 /fs/buffer.c |
|---|
 2.4.13-ac8
 fs
 buffer.c
diff -u --new-file --recursive --exclude-from /usr/src/exclude linux.vanilla/fs/buffer.c linux.ac/fs/buffer.c
--- linux.vanilla/fs/buffer.c Thu Oct 25 16:26:38 2001
+++ linux.ac/fs/buffer.c Mon Oct 15 09:12:15 2001
@@ -46,19 +46,30 @@
#include <linux/iobuf.h>
#include <linux/highmem.h>
#include <linux/completion.h>
+#include <linux/jbd.h>
+#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/bitops.h>
#include <asm/mmu_context.h>
+#define NR_SIZES 7
+static char buffersize_index[65] =
+{-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
+ 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
+ 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
+ 6};
+
+#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
#define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
#define NR_RESERVED (10*MAX_BUF_PER_PAGE)
#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
number of unused buffer heads */
/* Anti-deadlock ordering:
- * lru_list_lock > hash_table_lock > unused_list_lock
+ * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
*/
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
@@ -81,7 +92,13 @@
static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
-static int grow_buffers(kdev_t dev, unsigned long block, int size);
+struct bh_free_head {
+ struct buffer_head *list;
+ spinlock_t lock;
+};
+static struct bh_free_head free_list[NR_SIZES];
+
+static int grow_buffers(int size);
static void __refile_buffer(struct buffer_head *);
/* This is used by some architectures to estimate available memory. */
@@ -99,29 +116,29 @@
*/
union bdflush_param {
struct {
- int nfract; /* Percentage of buffer cache dirty to
- activate bdflush */
- int dummy1; /* old "ndirty" */
- int dummy2; /* old "nrefill" */
- int dummy3; /* unused */
- int interval; /* jiffies delay between kupdate flushes */
- int age_buffer; /* Time for normal buffer to age before we flush it */
- int nfract_sync;/* Percentage of buffer cache dirty to
- activate bdflush synchronously */
- int dummy4; /* unused */
- int dummy5; /* unused */
+ int nfract; /* Percentage of buffer cache dirty to
+ activate bdflush */
+ int ndirty; /* Maximum number of dirty blocks to write out per
+ wake-cycle */
+ int nrefill; /* Number of clean buffers to try to obtain
+ each time we call refill */
+ int dummy1; /* unused */
+ int interval; /* jiffies delay between kupdate flushes */
+ int age_buffer; /* Time for normal buffer to age before we flush it */
+ int nfract_sync; /* Percentage of buffer cache dirty to
+ activate bdflush synchronously */
+ int dummy2; /* unused */
+ int dummy3; /* unused */
} b_un;
unsigned int data[N_PARAM];
-} bdf_prm = {{40, 0, 0, 0, 5*HZ, 30*HZ, 60, 0, 0}};
+} bdf_prm = {{30, 64, 64, 256, 5*HZ, 30*HZ, 60, 0, 0}};
/* These are the min and max parameter values that we will allow to be assigned */
int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 0, 0, 0};
-int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 6000*HZ, 100, 0, 0};
+int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 100, 0, 0};
-void unlock_buffer(struct buffer_head *bh)
+inline void unlock_buffer(struct buffer_head *bh)
{
- clear_bit(BH_Wait_IO, &bh->b_state);
- clear_bit(BH_launder, &bh->b_state);
clear_bit(BH_Lock, &bh->b_state);
smp_mb__after_clear_bit();
if (waitqueue_active(&bh->b_wait))
@@ -170,32 +187,36 @@
/*
* The buffers have been marked clean and locked. Just submit the dang
* things..
+ *
+ * We'll wait for the first one of them - "sync" is not exactly
+ * performance-critical, and this makes us not hog the IO subsystem
+ * completely, while still allowing for a fair amount of concurrent IO.
*/
static void write_locked_buffers(struct buffer_head **array, unsigned int count)
{
+ struct buffer_head *wait = *array;
+ get_bh(wait);
do {
struct buffer_head * bh = *array++;
bh->b_end_io = end_buffer_io_sync;
submit_bh(WRITE, bh);
} while (--count);
+ wait_on_buffer(wait);
+ put_bh(wait);
}
-/*
- * Write some buffers from the head of the dirty queue.
- *
- * This must be called with the LRU lock held, and will
- * return without it!
- */
#define NRSYNC (32)
-static int write_some_buffers(kdev_t dev)
+static void write_unlocked_buffers(kdev_t dev)
{
struct buffer_head *next;
struct buffer_head *array[NRSYNC];
unsigned int count;
int nr;
+repeat:
+ spin_lock(&lru_list_lock);
next = lru_list[BUF_DIRTY];
- nr = nr_buffers_type[BUF_DIRTY];
+ nr = nr_buffers_type[BUF_DIRTY] * 2;
count = 0;
while (next && --nr >= 0) {
struct buffer_head * bh = next;
@@ -205,51 +226,35 @@
continue;
if (test_and_set_bit(BH_Lock, &bh->b_state))
continue;
+ get_bh(bh);
if (atomic_set_buffer_clean(bh)) {
__refile_buffer(bh);
- get_bh(bh);
array[count++] = bh;
if (count < NRSYNC)
continue;
spin_unlock(&lru_list_lock);
write_locked_buffers(array, count);
- return -EAGAIN;
+ goto repeat;
}
unlock_buffer(bh);
- __refile_buffer(bh);
+ put_bh(bh);
}
spin_unlock(&lru_list_lock);
if (count)
write_locked_buffers(array, count);
- return 0;
-}
-
-/*
- * Write out all buffers on the dirty list.
- */
-static void write_unlocked_buffers(kdev_t dev)
-{
- do {
- spin_lock(&lru_list_lock);
- } while (write_some_buffers(dev));
- run_task_queue(&tq_disk);
}
-/*
- * Wait for a buffer on the proper list.
- *
- * This must be called with the LRU lock held, and
- * will return with it released.
- */
-static int wait_for_buffers(kdev_t dev, int index, int refile)
+static int wait_for_locked_buffers(kdev_t dev, int index, int refile)
{
struct buffer_head * next;
int nr;
+repeat:
+ spin_lock(&lru_list_lock);
next = lru_list[index];
- nr = nr_buffers_type[index];
+ nr = nr_buffers_type[index] * 2;
while (next && --nr >= 0) {
struct buffer_head *bh = next;
next = bh->b_next_free;
@@ -266,26 +271,12 @@
spin_unlock(&lru_list_lock);
wait_on_buffer (bh);
put_bh(bh);
- return -EAGAIN;
+ goto repeat;
}
spin_unlock(&lru_list_lock);
return 0;
}
-static inline void wait_for_some_buffers(kdev_t dev)
-{
- spin_lock(&lru_list_lock);
- wait_for_buffers(dev, BUF_LOCKED, 1);
-}
-
-static int wait_for_locked_buffers(kdev_t dev, int index, int refile)
-{
- do {
- spin_lock(&lru_list_lock);
- } while (wait_for_buffers(dev, index, refile));
- return 0;
-}
-
/* Call sync_buffers with wait!=0 to ensure that the call does not
* return until all buffer writes have completed. Sync() may return
* before the writes have finished; fsync() may not.
@@ -296,7 +287,7 @@
* We will ultimately want to put these in a separate list, but for
* now we search all of the lists for dirty buffers.
*/
-int sync_buffers(kdev_t dev, int wait)
+static int sync_buffers(kdev_t dev, int wait)
{
int err = 0;
@@ -467,26 +458,20 @@
((block) << (bh_hash_shift - 12))))
#define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
-static inline void __insert_into_hash_list(struct buffer_head *bh)
+static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
{
- struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
- struct buffer_head *next = *head;
-
+ if ((bh->b_next = *head) != NULL)
+ bh->b_next->b_pprev = &bh->b_next;
*head = bh;
bh->b_pprev = head;
- bh->b_next = next;
- if (next != NULL)
- next->b_pprev = &bh->b_next;
}
static __inline__ void __hash_unlink(struct buffer_head *bh)
{
- struct buffer_head **pprev = bh->b_pprev;
- if (pprev) {
- struct buffer_head *next = bh->b_next;
- if (next)
- next->b_pprev = pprev;
- *pprev = next;
+ if (bh->b_pprev) {
+ if (bh->b_next)
+ bh->b_next->b_pprev = bh->b_pprev;
+ *(bh->b_pprev) = bh->b_next;
bh->b_pprev = NULL;
}
}
@@ -495,8 +480,6 @@
{
struct buffer_head **bhp = &lru_list[blist];
- if (bh->b_prev_free || bh->b_next_free) BUG();
-
if(!*bhp) {
*bhp = bh;
bh->b_prev_free = bh;
@@ -509,77 +492,113 @@
size_buffers_type[blist] += bh->b_size;
}
-static void __remove_from_lru_list(struct buffer_head * bh)
+static void __remove_from_lru_list(struct buffer_head * bh, int blist)
{
- struct buffer_head *next = bh->b_next_free;
- if (next) {
- struct buffer_head *prev = bh->b_prev_free;
- int blist = bh->b_list;
-
- prev->b_next_free = next;
- next->b_prev_free = prev;
- if (lru_list[blist] == bh) {
- if (next == bh)
- next = NULL;
- lru_list[blist] = next;
- }
- bh->b_next_free = NULL;
- bh->b_prev_free = NULL;
+ if (bh->b_prev_free || bh->b_next_free) {
+ bh->b_prev_free->b_next_free = bh->b_next_free;
+ bh->b_next_free->b_prev_free = bh->b_prev_free;
+ if (lru_list[blist] == bh)
+ lru_list[blist] = bh->b_next_free;
+ if (lru_list[blist] == bh)
+ lru_list[blist] = NULL;
+ bh->b_next_free = bh->b_prev_free = NULL;
nr_buffers_type[blist]--;
size_buffers_type[blist] -= bh->b_size;
}
}
+static void __remove_from_free_list(struct buffer_head * bh, int index)
+{
+ if(bh->b_next_free == bh)
+ free_list[index].list = NULL;
+ else {
+ bh->b_prev_free->b_next_free = bh->b_next_free;
+ bh->b_next_free->b_prev_free = bh->b_prev_free;
+ if (free_list[index].list == bh)
+ free_list[index].list = bh->b_next_free;
+ }
+ bh->b_next_free = bh->b_prev_free = NULL;
+}
+
/* must be called with both the hash_table_lock and the lru_list_lock
held */
static void __remove_from_queues(struct buffer_head *bh)
{
__hash_unlink(bh);
- __remove_from_lru_list(bh);
+ __remove_from_lru_list(bh, bh->b_list);
}
-struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
+static void __insert_into_queues(struct buffer_head *bh)
{
- struct buffer_head *bh, **p = &hash(dev, block);
+ struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
- read_lock(&hash_table_lock);
+ __hash_link(bh, head);
+ __insert_into_lru_list(bh, bh->b_list);
+}
- for (;;) {
- bh = *p;
- if (!bh)
+/* This function must only run if there are no other
+ * references _anywhere_ to this buffer head.
+ */
+static void put_last_free(struct buffer_head * bh)
+{
+ struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
+ struct buffer_head **bhp = &head->list;
+
+ bh->b_state = 0;
+
+ spin_lock(&head->lock);
+ bh->b_dev = B_FREE;
+ if(!*bhp) {
+ *bhp = bh;
+ bh->b_prev_free = bh;
+ }
+ bh->b_next_free = *bhp;
+ bh->b_prev_free = (*bhp)->b_prev_free;
+ (*bhp)->b_prev_free->b_next_free = bh;
+ (*bhp)->b_prev_free = bh;
+ spin_unlock(&head->lock);
+}
+
+/*
+ * Why like this, I hear you say... The reason is race-conditions.
+ * As we don't lock buffers (unless we are reading them, that is),
+ * something might happen to it while we sleep (i.e. a read-error
+ * will force it bad). This shouldn't really happen currently, but
+ * the code is ready.
+ */
+static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
+{
+ struct buffer_head *bh = hash(dev, block);
+
+ for (; bh; bh = bh->b_next)
+ if (bh->b_blocknr == block &&
+ bh->b_size == size &&
+ bh->b_dev == dev)
break;
- p = &bh->b_next;
- if (bh->b_blocknr != block)
- continue;
- if (bh->b_size != size)
- continue;
- if (bh->b_dev != dev)
- continue;
+ if (bh)
get_bh(bh);
- break;
- }
- read_unlock(&hash_table_lock);
return bh;
}
-void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
+struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
{
- spin_lock(&lru_list_lock);
- if (bh->b_inode)
- list_del(&bh->b_inode_buffers);
- bh->b_inode = inode;
- list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers);
- spin_unlock(&lru_list_lock);
+ struct buffer_head *bh;
+
+ read_lock(&hash_table_lock);
+ bh = __get_hash_table(dev, block, size);
+ read_unlock(&hash_table_lock);
+
+ return bh;
}
-void buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode)
+void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
{
spin_lock(&lru_list_lock);
if (bh->b_inode)
list_del(&bh->b_inode_buffers);
bh->b_inode = inode;
- list_add(&bh->b_inode_buffers, &inode->i_dirty_data_buffers);
+ list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers);
spin_unlock(&lru_list_lock);
}
@@ -602,12 +621,13 @@
int ret;
spin_lock(&lru_list_lock);
- ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
+ ret = !list_empty(&inode->i_dirty_buffers);
spin_unlock(&lru_list_lock);
return ret;
}
+
/* If invalidate_buffers() will trash dirty buffers, it means some kind
of fs corruption is going on. Trashing dirty data always imply losing
information that was supposed to be just stored on the physical layer
@@ -627,20 +647,11 @@
These are two special cases. Normal usage imply the device driver
to issue a sync on the device (without waiting I/O completion) and
- then an invalidate_buffers call that doesn't trash dirty buffers.
-
- For handling cache coherency with the blkdev pagecache the 'update' case
- is been introduced. It is needed to re-read from disk any pinned
- buffer. NOTE: re-reading from disk is destructive so we can do it only
- when we assume nobody is changing the buffercache under our I/O and when
- we think the disk contains more recent information than the buffercache.
- The update == 1 pass marks the buffers we need to update, the update == 2
- pass does the actual I/O. */
-void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
+ then an invalidate_buffers call that doesn't trash dirty buffers. */
+void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
{
int i, nlist, slept;
struct buffer_head * bh, * bh_next;
- kdev_t dev = to_kdev_t(bdev->bd_dev); /* will become bdev */
retry:
slept = 0;
@@ -668,17 +679,13 @@
}
write_lock(&hash_table_lock);
- /* All buffers in the lru lists are mapped */
- if (!buffer_mapped(bh))
- BUG();
- if (buffer_dirty(bh))
- printk("invalidate: dirty buffer\n");
- if (!atomic_read(&bh->b_count)) {
- if (destroy_dirty_buffers || !buffer_dirty(bh)) {
- remove_inode_queue(bh);
- }
- } else
- printk("invalidate: busy buffer\n");
+ if (!atomic_read(&bh->b_count) &&
+ (destroy_dirty_buffers || !buffer_dirty(bh))) {
+ remove_inode_queue(bh);
+ __remove_from_queues(bh);
+ put_last_free(bh);
+ }
+ /* else complain loudly? */
write_unlock(&hash_table_lock);
if (slept)
@@ -689,29 +696,106 @@
spin_unlock(&lru_list_lock);
if (slept)
goto retry;
-
- /* Get rid of the page cache */
- invalidate_inode_pages(bdev->bd_inode);
}
-void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
+int set_blocksize(kdev_t dev, int size)
{
- struct block_device *bdev = bdget(dev);
- if (bdev) {
- invalidate_bdev(bdev, destroy_dirty_buffers);
- bdput(bdev);
+ extern int *blksize_size[];
+ int i, nlist, slept;
+ struct buffer_head * bh, * bh_next;
+
+ if (!blksize_size[MAJOR(dev)])
+ return 0;
+
+ /* Size must be a power of two, and between 512 and PAGE_SIZE */
+ if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
+ panic("Invalid blocksize passed to set_blocksize");
+
+ if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
+ blksize_size[MAJOR(dev)][MINOR(dev)] = size;
+ return 0;
+ }
+ if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
+ return 0;
+ sync_buffers(dev, 2);
+ blksize_size[MAJOR(dev)][MINOR(dev)] = size;
+
+ retry:
+ slept = 0;
+ spin_lock(&lru_list_lock);
+ for(nlist = 0; nlist < NR_LIST; nlist++) {
+ bh = lru_list[nlist];
+ if (!bh)
+ continue;
+ for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
+ bh_next = bh->b_next_free;
+ if (bh->b_dev != dev || bh->b_size == size)
+ continue;
+ /* Unhashed? */
+ if (!bh->b_pprev)
+ continue;
+ if (buffer_locked(bh)) {
+ get_bh(bh);
+ spin_unlock(&lru_list_lock);
+ wait_on_buffer(bh);
+ slept = 1;
+ spin_lock(&lru_list_lock);
+ put_bh(bh);
+ }
+
+ write_lock(&hash_table_lock);
+ if (!atomic_read(&bh->b_count)) {
+ if (buffer_dirty(bh))
+ printk(KERN_WARNING
+ "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
+ kdevname(dev), bh->b_blocknr, bh->b_size);
+ remove_inode_queue(bh);
+ __remove_from_queues(bh);
+ put_last_free(bh);
+ } else {
+ if (atomic_set_buffer_clean(bh))
+ __refile_buffer(bh);
+ clear_bit(BH_Uptodate, &bh->b_state);
+ printk(KERN_WARNING
+ "set_blocksize: "
+ "b_count %d, dev %s, block %lu, from %p\n",
+ atomic_read(&bh->b_count), bdevname(bh->b_dev),
+ bh->b_blocknr, __builtin_return_address(0));
+ }
+ write_unlock(&hash_table_lock);
+ if (slept)
+ goto out;
+ }
}
+ out:
+ spin_unlock(&lru_list_lock);
+ if (slept)
+ goto retry;
+ return 0;
}
static void free_more_memory(void)
{
- balance_dirty();
- wakeup_bdflush();
+ balance_dirty(NODEV);
+ page_launder(GFP_NOFS, 0);
+ wakeup_bdflush(0);
+ wakeup_kswapd();
current->policy |= SCHED_YIELD;
__set_current_state(TASK_RUNNING);
schedule();
}
+/*
+ * We used to try various strange things. Let's not.
+ * We'll just try to balance dirty buffers, and possibly
+ * launder some pages.
+ */
+static void refill_freelist(int size)
+{
+ if (!grow_buffers(size))
+ free_more_memory();
+}
+
void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
{
bh->b_list = BUF_CLEAN;
@@ -758,6 +842,7 @@
/* OK, the async IO on this page is complete. */
spin_unlock_irqrestore(&page_uptodate_lock, flags);
+ put_bh(bh);
/*
* if none of the buffers had errors then we can set the
@@ -766,17 +851,12 @@
if (!PageError(page))
SetPageUptodate(page);
- /*
- * Run the hooks that have to be done when a page I/O has completed.
- */
- if (PageTestandClearDecrAfter(page))
- atomic_dec(&nr_async_pages);
-
UnlockPage(page);
return;
still_busy:
+ put_bh(bh);
spin_unlock_irqrestore(&page_uptodate_lock, flags);
return;
}
@@ -855,54 +935,6 @@
return err2;
}
-int fsync_inode_data_buffers(struct inode *inode)
-{
- struct buffer_head *bh;
- struct inode tmp;
- int err = 0, err2;
-
- INIT_LIST_HEAD(&tmp.i_dirty_data_buffers);
-
- spin_lock(&lru_list_lock);
-
- while (!list_empty(&inode->i_dirty_data_buffers)) {
- bh = BH_ENTRY(inode->i_dirty_data_buffers.next);
- list_del(&bh->b_inode_buffers);
- if (!buffer_dirty(bh) && !buffer_locked(bh))
- bh->b_inode = NULL;
- else {
- bh->b_inode = &tmp;
- list_add(&bh->b_inode_buffers, &tmp.i_dirty_data_buffers);
- if (buffer_dirty(bh)) {
- get_bh(bh);
- spin_unlock(&lru_list_lock);
- ll_rw_block(WRITE, 1, &bh);
- brelse(bh);
- spin_lock(&lru_list_lock);
- }
- }
- }
-
- while (!list_empty(&tmp.i_dirty_data_buffers)) {
- bh = BH_ENTRY(tmp.i_dirty_data_buffers.prev);
- remove_inode_queue(bh);
- get_bh(bh);
- spin_unlock(&lru_list_lock);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh))
- err = -EIO;
- brelse(bh);
- spin_lock(&lru_list_lock);
- }
-
- spin_unlock(&lru_list_lock);
- err2 = osync_inode_data_buffers(inode);
-
- if (err)
- return err;
- else
- return err2;
-}
/*
* osync is designed to support O_SYNC io. It waits synchronously for
@@ -944,35 +976,6 @@
return err;
}
-int osync_inode_data_buffers(struct inode *inode)
-{
- struct buffer_head *bh;
- struct list_head *list;
- int err = 0;
-
- spin_lock(&lru_list_lock);
-
- repeat:
-
- for (list = inode->i_dirty_data_buffers.prev;
- bh = BH_ENTRY(list), list != &inode->i_dirty_data_buffers;
- list = bh->b_inode_buffers.prev) {
- if (buffer_locked(bh)) {
- get_bh(bh);
- spin_unlock(&lru_list_lock);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh))
- err = -EIO;
- brelse(bh);
- spin_lock(&lru_list_lock);
- goto repeat;
- }
- }
-
- spin_unlock(&lru_list_lock);
- return err;
-}
-
/*
* Invalidate any and all dirty buffers on a given inode. We are
@@ -981,13 +984,15 @@
*/
void invalidate_inode_buffers(struct inode *inode)
{
- struct list_head * entry;
+ struct list_head *list, *next;
spin_lock(&lru_list_lock);
- while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
- remove_inode_queue(BH_ENTRY(entry));
- while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
- remove_inode_queue(BH_ENTRY(entry));
+ list = inode->i_dirty_buffers.next;
+ while (list != &inode->i_dirty_buffers) {
+ next = list->next;
+ remove_inode_queue(BH_ENTRY(list));
+ list = next;
+ }
spin_unlock(&lru_list_lock);
}
@@ -1004,22 +1009,60 @@
*/
struct buffer_head * getblk(kdev_t dev, int block, int size)
{
- for (;;) {
- struct buffer_head * bh;
+ struct buffer_head * bh;
+ int isize;
- bh = get_hash_table(dev, block, size);
- if (bh)
- return bh;
+repeat:
+ spin_lock(&lru_list_lock);
+ write_lock(&hash_table_lock);
+ bh = __get_hash_table(dev, block, size);
+ if (bh)
+ goto out;
+
+ isize = BUFSIZE_INDEX(size);
+ spin_lock(&free_list[isize].lock);
+ bh = free_list[isize].list;
+ if (bh) {
+ __remove_from_free_list(bh, isize);
+ atomic_set(&bh->b_count, 1);
+ }
+ spin_unlock(&free_list[isize].lock);
- if (!grow_buffers(dev, block, size))
- free_more_memory();
+ /*
+ * OK, FINALLY we know that this buffer is the only one of
+ * its kind, we hold a reference (b_count>0), it is unlocked,
+ * and it is clean.
+ */
+ if (bh) {
+ init_buffer(bh, NULL, NULL);
+ bh->b_dev = dev;
+ bh->b_blocknr = block;
+ bh->b_state = 1 << BH_Mapped;
+
+ /* Insert the buffer into the regular lists */
+ __insert_into_queues(bh);
+ out:
+ write_unlock(&hash_table_lock);
+ spin_unlock(&lru_list_lock);
+ touch_buffer(bh);
+ return bh;
}
+
+ /*
+ * If we block while refilling the free list, somebody may
+ * create the buffer first ... search the hashes again.
+ */
+ write_unlock(&hash_table_lock);
+ spin_unlock(&lru_list_lock);
+ refill_freelist(size);
+ /* FIXME: getblk should fail if there's not enough memory */
+ goto repeat;
}
/* -1 -> no need to flush
0 -> async flush
1 -> sync flush (wait for I/O completion) */
-static int balance_dirty_state(void)
+int balance_dirty_state(kdev_t dev)
{
unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
@@ -1047,30 +1090,16 @@
* pressures on different devices - thus the (currently unused)
* 'dev' parameter.
*/
-void balance_dirty(void)
+void balance_dirty(kdev_t dev)
{
- int state = balance_dirty_state();
+ int state = balance_dirty_state(dev);
if (state < 0)
return;
-
- /* If we're getting into imbalance, start write-out */
- spin_lock(&lru_list_lock);
- write_some_buffers(NODEV);
-
- /*
- * And if we're _really_ out of balance, wait for
- * some of the dirty/locked buffers ourselves and
- * start bdflush.
- * This will throttle heavy writers.
- */
- if (state > 0) {
- wait_for_some_buffers(NODEV);
- wakeup_bdflush();
- }
+ wakeup_bdflush(state);
}
-inline void __mark_dirty(struct buffer_head *bh)
+static __inline__ void __mark_dirty(struct buffer_head *bh)
{
bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
refile_buffer(bh);
@@ -1088,10 +1117,16 @@
{
if (!atomic_set_buffer_dirty(bh)) {
__mark_dirty(bh);
- balance_dirty();
+ balance_dirty(bh->b_dev);
}
}
+void set_buffer_flushtime(struct buffer_head *bh)
+{
+ bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
+}
+EXPORT_SYMBOL(set_buffer_flushtime);
+
/*
* A buffer may need to be moved from one buffer list to another
* (e.g. in case it is not shared any more). Handle this.
@@ -1103,8 +1138,10 @@
dispose = BUF_LOCKED;
if (buffer_dirty(bh))
dispose = BUF_DIRTY;
+ if (buffer_protected(bh))
+ dispose = BUF_PROTECTED;
if (dispose != bh->b_list) {
- __remove_from_lru_list(bh);
+ __remove_from_lru_list(bh, bh->b_list);
bh->b_list = dispose;
if (dispose == BUF_CLEAN)
remove_inode_queue(bh);
@@ -1132,13 +1169,30 @@
}
/*
- * bforget() is like brelse(), except it discards any
- * potentially dirty data.
+ * bforget() is like brelse(), except it puts the buffer on the
+ * free list if it can. We can NOT free the buffer if:
+ * - there are other users of it
+ * - it is locked and thus can have active IO
*/
void __bforget(struct buffer_head * buf)
{
- mark_buffer_clean(buf);
- __brelse(buf);
+ /* grab the lru lock here to block bdflush. */
+ spin_lock(&lru_list_lock);
+ write_lock(&hash_table_lock);
+ if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf) || buffer_protected(buf))
+ goto in_use;
+ J_ASSERT_BH(buf, !buffer_jbd(buf));
+ __hash_unlink(buf);
+ remove_inode_queue(buf);
+ write_unlock(&hash_table_lock);
+ __remove_from_lru_list(buf, buf->b_list);
+ spin_unlock(&lru_list_lock);
+ put_last_free(buf);
+ return;
+
+ in_use:
+ write_unlock(&hash_table_lock);
+ spin_unlock(&lru_list_lock);
}
/**
@@ -1167,14 +1221,23 @@
/*
* Note: the caller should wake up the buffer_wait list if needed.
*/
-static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
+static void __put_unused_buffer_head(struct buffer_head * bh)
{
if (bh->b_inode)
BUG();
+
+ J_ASSERT_BH(bh, bh->b_prev_free == 0);
+#if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+ if (buffer_jbd(bh)) {
+ J_ASSERT_BH(bh, bh2jh(bh)->b_transaction == 0);
+ J_ASSERT_BH(bh, bh2jh(bh)->b_next_transaction == 0);
+ J_ASSERT_BH(bh, bh2jh(bh)->b_frozen_data == 0);
+ J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data == 0);
+ }
+#endif
if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
kmem_cache_free(bh_cachep, bh);
} else {
- bh->b_dev = B_FREE;
bh->b_blocknr = -1;
bh->b_this_page = NULL;
@@ -1184,12 +1247,20 @@
}
}
+void put_unused_buffer_head(struct buffer_head *bh)
+{
+ spin_lock(&unused_list_lock);
+ __put_unused_buffer_head(bh);
+ spin_unlock(&unused_list_lock);
+}
+EXPORT_SYMBOL(put_unused_buffer_head);
+
/*
* Reserve NR_RESERVED buffer heads for async IO requests to avoid
* no-buffer-head deadlock. Return NULL on failure; waiting for
* buffer heads is now handled in create_buffers().
*/
-static struct buffer_head * get_unused_buffer_head(int async)
+struct buffer_head * get_unused_buffer_head(int async)
{
struct buffer_head * bh;
@@ -1227,10 +1298,26 @@
}
spin_unlock(&unused_list_lock);
}
+#if 0
+ /*
+ * (Pending further analysis ...)
+ * Ordinary (non-async) requests can use a different memory priority
+ * to free up pages. Any swapping thus generated will use async
+ * buffer heads.
+ */
+ if(!async &&
+ (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
+ memset(bh, 0, sizeof(*bh));
+ init_waitqueue_head(&bh->b_wait);
+ return bh;
+ }
+#endif
return NULL;
}
+EXPORT_SYMBOL(get_unused_buffer_head);
+
void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
{
bh->b_page = page;
@@ -1245,6 +1332,8 @@
bh->b_data = page_address(page) + offset;
}
+EXPORT_SYMBOL(set_bh_page);
+
/*
* Create the appropriate buffers when given a page for data area and
* the size of each buffer.. Use the bh->b_this_page linked list to
@@ -1267,7 +1356,7 @@
if (!bh)
goto no_grow;
- bh->b_dev = NODEV;
+ bh->b_dev = B_FREE; /* Flag as unused */
bh->b_this_page = head;
head = bh;
@@ -1321,10 +1410,7 @@
goto try_again;
}
-/*
- * Called when truncating a buffer on a page completely.
- */
-static void discard_buffer(struct buffer_head * bh)
+static void unmap_buffer(struct buffer_head * bh)
{
if (buffer_mapped(bh)) {
mark_buffer_clean(bh);
@@ -1337,6 +1423,31 @@
}
}
+/**
+ * try_to_release_page - release old fs-specific metadata on a page
+ *
+ */
+
+int try_to_release_page(struct page * page, int gfp_mask)
+{
+ if (!PageLocked(page))
+ BUG();
+
+ if (!page->mapping)
+ goto try_to_free;
+ if (!page->mapping->a_ops->releasepage)
+ goto try_to_free;
+ if (page->mapping->a_ops->releasepage(page, gfp_mask))
+ goto try_to_free;
+ /*
+ * We couldn't release buffer metadata; don't even bother trying
+ * to release buffers.
+ */
+ return 0;
+try_to_free:
+ return try_to_free_buffers(page, gfp_mask);
+}
+
/*
* We don't have to release all buffers here, but
* we have to be sure that no dirty buffer is left
@@ -1344,7 +1455,7 @@
* we have truncated the file and are going to free the
* blocks on-disk..
*/
-int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
+int block_flushpage(struct page *page, unsigned long offset)
{
struct buffer_head *head, *bh, *next;
unsigned int curr_off = 0;
@@ -1364,7 +1475,7 @@
* is this block fully flushed?
*/
if (offset <= curr_off)
- discard_buffer(bh);
+ unmap_buffer(bh);
curr_off = next_off;
bh = next;
} while (bh != head);
@@ -1380,8 +1491,10 @@
* instead.
*/
if (!offset) {
- if (!try_to_free_buffers(page, 0))
+ if (!try_to_release_page(page, 0)) {
+ atomic_inc(&buffermem_pages);
return 0;
+ }
}
return 1;
@@ -1408,6 +1521,7 @@
page->buffers = head;
page_cache_get(page);
}
+EXPORT_SYMBOL(create_empty_buffers);
/*
* We are taking a block for data and we don't want any output from any
@@ -1426,11 +1540,16 @@
struct buffer_head *old_bh;
old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
+ J_ASSERT_BH(bh, old_bh != bh);
if (old_bh) {
+ J_ASSERT_BH(old_bh, !buffer_jlist_eq(old_bh, BJ_Metadata));
mark_buffer_clean(old_bh);
wait_on_buffer(old_bh);
clear_bit(BH_Req, &old_bh->b_state);
- __brelse(old_bh);
+ /* Here we could run brelse or bforget. We use
+ bforget because it will try to put the buffer
+ in the freelist. */
+ __bforget(old_bh);
}
}
@@ -1448,23 +1567,23 @@
*/
/*
- * block_write_full_page() is SMP-safe - currently it's still
- * being called with the kernel lock held, but the code is ready.
+ * block_write_full_page() is SMP threaded - the kernel lock is not held.
*/
static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
{
int err, i;
unsigned long block;
struct buffer_head *bh, *head;
+ int need_unlock = 1;
if (!PageLocked(page))
BUG();
if (!page->buffers)
- create_empty_buffers(page, inode->i_dev, 1 << inode->i_blkbits);
+ create_empty_buffers(page, inode->i_dev, inode->i_sb->s_blocksize);
head = page->buffers;
- block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
bh = head;
i = 0;
@@ -1494,11 +1613,13 @@
do {
lock_buffer(bh);
set_buffer_async_io(bh);
+ get_bh(bh);
set_bit(BH_Uptodate, &bh->b_state);
clear_bit(BH_Dirty, &bh->b_state);
bh = bh->b_this_page;
} while (bh != head);
+ SetPageUptodate(page);
/* Stage 3: submit the IO */
do {
struct buffer_head *next = bh->b_this_page;
@@ -1507,12 +1628,32 @@
} while (bh != head);
/* Done - end_buffer_io_async will unlock */
- SetPageUptodate(page);
return 0;
out:
ClearPageUptodate(page);
- UnlockPage(page);
+ bh = head;
+ need_unlock = 1;
+ /* Recovery: lock and submit the mapped buffers */
+ do {
+ if (buffer_mapped(bh)) {
+ lock_buffer(bh);
+ need_unlock = 0;
+ }
+ bh = bh->b_this_page;
+ } while (bh != head);
+ do {
+ if (buffer_mapped(bh)) {
+ bh->b_end_io = end_buffer_io_async;
+ get_bh(bh);
+ set_bit(BH_Uptodate, &bh->b_state);
+ clear_bit(BH_Dirty, &bh->b_state);
+ submit_bh(WRITE, bh);
+ }
+ bh = bh->b_this_page;
+ } while(bh != head);
+ if (need_unlock)
+ UnlockPage(page);
return err;
}
@@ -1526,12 +1667,12 @@
struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
char *kaddr = kmap(page);
- blocksize = 1 << inode->i_blkbits;
+ blocksize = inode->i_sb->s_blocksize;
if (!page->buffers)
create_empty_buffers(page, inode->i_dev, blocksize);
head = page->buffers;
- bbits = inode->i_blkbits;
+ bbits = inode->i_sb->s_blocksize_bits;
block = page->index << (PAGE_CACHE_SHIFT - bbits);
for(bh = head, block_start = 0; bh != head || !block_start;
@@ -1583,6 +1724,17 @@
}
return 0;
out:
+ bh = head;
+ block_start = 0;
+ do {
+ if (buffer_new(bh) && !buffer_uptodate(bh)) {
+ memset(kaddr+block_start, 0, bh->b_size);
+ set_bit(BH_Uptodate, &bh->b_state);
+ mark_buffer_dirty(bh);
+ }
+ block_start += bh->b_size;
+ bh = bh->b_this_page;
+ } while (bh != head);
return err;
}
@@ -1594,7 +1746,7 @@
unsigned blocksize;
struct buffer_head *bh, *head;
- blocksize = 1 << inode->i_blkbits;
+ blocksize = inode->i_sb->s_blocksize;
for(bh = head = page->buffers, block_start = 0;
bh != head || !block_start;
@@ -1607,14 +1759,14 @@
set_bit(BH_Uptodate, &bh->b_state);
if (!atomic_set_buffer_dirty(bh)) {
__mark_dirty(bh);
- buffer_insert_inode_data_queue(bh, inode);
+ buffer_insert_inode_queue(bh, inode);
need_balance_dirty = 1;
}
}
}
if (need_balance_dirty)
- balance_dirty();
+ balance_dirty(bh->b_dev);
/*
* is this a partial write that happened to make all buffers
* uptodate then we can optimize away a bogus readpage() for
@@ -1643,14 +1795,14 @@
if (!PageLocked(page))
PAGE_BUG(page);
- blocksize = 1 << inode->i_blkbits;
+ blocksize = inode->i_sb->s_blocksize;
if (!page->buffers)
create_empty_buffers(page, inode->i_dev, blocksize);
head = page->buffers;
- blocks = PAGE_CACHE_SIZE >> inode->i_blkbits;
- iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- lblock = (inode->i_size+blocksize-1) >> inode->i_blkbits;
+ blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
+ iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+ lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
bh = head;
nr = 0;
i = 0;
@@ -1695,6 +1847,7 @@
struct buffer_head * bh = arr[i];
lock_buffer(bh);
set_buffer_async_io(bh);
+ get_bh(bh);
}
/* Stage 3: start the IO */
@@ -1717,7 +1870,7 @@
unsigned long pgpos;
long status;
unsigned zerofrom;
- unsigned blocksize = 1 << inode->i_blkbits;
+ unsigned blocksize = inode->i_sb->s_blocksize;
char *kaddr;
while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
@@ -1802,14 +1955,6 @@
return err;
}
-int block_commit_write(struct page *page, unsigned from, unsigned to)
-{
- struct inode *inode = page->mapping->host;
- __block_commit_write(inode,page,from,to);
- kunmap(page);
- return 0;
-}
-
int generic_commit_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
@@ -1834,7 +1979,7 @@
struct buffer_head *bh;
int err;
- blocksize = 1 << inode->i_blkbits;
+ blocksize = inode->i_sb->s_blocksize;
length = offset & (blocksize - 1);
/* Block boundary? Nothing to do */
@@ -1842,7 +1987,7 @@
return 0;
length = blocksize - length;
- iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
page = grab_cache_page(mapping, index);
err = -ENOMEM;
@@ -2058,20 +2203,9 @@
while (length > 0) {
blocknr = b[bufind++];
- if (blocknr == -1UL) {
- if (rw == READ) {
- /* there was an hole in the filesystem */
- memset(kmap(map) + offset, 0, size);
- flush_dcache_page(map);
- kunmap(map);
-
- transferred += size;
- goto skip_block;
- } else
- BUG();
- }
tmp = bhs[bhind++];
+ tmp->b_dev = B_FREE;
tmp->b_size = size;
set_bh_page(tmp, map, offset);
tmp->b_this_page = tmp;
@@ -2087,6 +2221,9 @@
} else
set_bit(BH_Uptodate, &tmp->b_state);
+ length -= size;
+ offset += size;
+
atomic_inc(&iobuf->io_count);
submit_bh(rw, tmp);
/*
@@ -2101,11 +2238,7 @@
goto finished;
bhind = 0;
}
-
- skip_block:
- length -= size;
- offset += size;
-
+
if (offset >= PAGE_SIZE) {
offset = 0;
break;
@@ -2116,12 +2249,14 @@
/* Is there any IO still left to submit? */
if (bhind) {
+ int tmp_err;
kiobuf_wait_for_io(iobuf); /* wake-one */
- err = wait_kio(rw, bhind, bhs, size);
- if (err >= 0)
- transferred += err;
+ tmp_err = wait_kio(rw, bhind, bhs, size);
+ if (tmp_err >= 0)
+ transferred += tmp_err;
else
- goto finished;
+ if (!err)
+ err = tmp_err;
}
finished:
@@ -2159,6 +2294,7 @@
bh->b_blocknr = *(b++);
set_bit(BH_Mapped, &bh->b_state);
set_buffer_async_io(bh);
+ get_bh(bh);
bh = bh->b_this_page;
} while (bh != head);
@@ -2207,173 +2343,103 @@
return err;
}
-static inline void link_dev_buffers(struct page * page, struct buffer_head *head)
-{
- struct buffer_head *bh, *tail;
-
- bh = head;
- do {
- tail = bh;
- bh = bh->b_this_page;
- } while (bh);
- tail->b_this_page = head;
- page->buffers = head;
- page_cache_get(page);
-}
-
-/*
- * Create the page-cache page that contains the requested block
- */
-static struct page * grow_dev_page(struct block_device *bdev, unsigned long index, int size)
-{
- struct page * page;
- struct buffer_head *bh;
-
- page = find_or_create_page(bdev->bd_inode->i_mapping, index, GFP_NOFS);
- if (IS_ERR(page))
- return NULL;
-
- if (!PageLocked(page))
- BUG();
-
- bh = page->buffers;
- if (bh) {
- if (bh->b_size == size)
- return page;
- if (!try_to_free_buffers(page, GFP_NOFS))
- goto failed;
- }
-
- bh = create_buffers(page, size, 0);
- if (!bh)
- goto failed;
- link_dev_buffers(page, bh);
- return page;
-
-failed:
- UnlockPage(page);
- page_cache_release(page);
- return NULL;
-}
-
-static void hash_page_buffers(struct page *page, kdev_t dev, int block, int size)
-{
- struct buffer_head *head = page->buffers;
- struct buffer_head *bh = head;
- unsigned int uptodate;
-
- uptodate = 1 << BH_Mapped;
- if (Page_Uptodate(page))
- uptodate |= 1 << BH_Uptodate;
-
- write_lock(&hash_table_lock);
- do {
- if (!(bh->b_state & (1 << BH_Mapped))) {
- init_buffer(bh, NULL, NULL);
- bh->b_dev = dev;
- bh->b_blocknr = block;
- bh->b_state = uptodate;
- }
-
- /* Insert the buffer into the hash lists if necessary */
- if (!bh->b_pprev)
- __insert_into_hash_list(bh);
-
- block++;
- bh = bh->b_this_page;
- } while (bh != head);
- write_unlock(&hash_table_lock);
-}
-
/*
* Try to increase the number of buffers available: the size argument
* is used to determine what kind of buffers we want.
*/
-static int grow_buffers(kdev_t dev, unsigned long block, int size)
+static int grow_buffers(int size)
{
struct page * page;
- struct block_device *bdev;
- unsigned long index;
- int sizebits;
+ struct buffer_head *bh, *tmp;
+ struct buffer_head * insert_point;
+ int isize;
if ((size & 511) || (size > PAGE_SIZE)) {
printk(KERN_ERR "VFS: grow_buffers: size = %d\n",size);
return 0;
}
- sizebits = -1;
- do {
- sizebits++;
- } while ((size << sizebits) < PAGE_SIZE);
-
- index = block >> sizebits;
- block = index << sizebits;
-
- bdev = bdget(kdev_t_to_nr(dev));
- if (!bdev) {
- printk("No block device for %s\n", kdevname(dev));
- BUG();
- }
-
- /* Create a page with the proper size buffers.. */
- page = grow_dev_page(bdev, index, size);
- /* This is "wrong" - talk to Al Viro */
- atomic_dec(&bdev->bd_count);
+ page = alloc_page(GFP_NOFS);
if (!page)
- return 0;
+ goto out;
+ LockPage(page);
+ bh = create_buffers(page, size, 0);
+ if (!bh)
+ goto no_buffer_head;
- /* Hash in the buffers on the hash list */
- hash_page_buffers(page, dev, block, size);
- UnlockPage(page);
- page_cache_release(page);
+ isize = BUFSIZE_INDEX(size);
- /* We hashed up this page, so increment buffermem */
+ spin_lock(&free_list[isize].lock);
+ insert_point = free_list[isize].list;
+ tmp = bh;
+ while (1) {
+ if (insert_point) {
+ tmp->b_next_free = insert_point->b_next_free;
+ tmp->b_prev_free = insert_point;
+ insert_point->b_next_free->b_prev_free = tmp;
+ insert_point->b_next_free = tmp;
+ } else {
+ tmp->b_prev_free = tmp;
+ tmp->b_next_free = tmp;
+ }
+ insert_point = tmp;
+ if (tmp->b_this_page)
+ tmp = tmp->b_this_page;
+ else
+ break;
+ }
+ tmp->b_this_page = bh;
+ free_list[isize].list = bh;
+ spin_unlock(&free_list[isize].lock);
+
+ page->buffers = bh;
+ page->flags &= ~(1 << PG_referenced);
+ lru_cache_add(page);
+ UnlockPage(page);
atomic_inc(&buffermem_pages);
return 1;
+
+no_buffer_head:
+ UnlockPage(page);
+ page_cache_release(page);
+out:
+ return 0;
}
-static int sync_page_buffers(struct buffer_head *head, unsigned int gfp_mask)
+/*
+ * Sync all the buffers on one page..
+ *
+ * If we have old buffers that are locked, we'll
+ * wait on them, but we won't wait on the new ones
+ * we're writing out now.
+ *
+ * This all is required so that we can free up memory
+ * later.
+ *
+ * The gfp_mask bits select how far we go:
+ * no __GFP_IO - not called at all (see try_to_free_buffers below)
+ * __GFP_IO - start write IO for dirty, unlocked buffers
+ * __GFP_WAIT - also wait for completion of locked buffers
+ */
+static void sync_page_buffers(struct buffer_head *bh, unsigned int gfp_mask)
{
- struct buffer_head * bh = head;
- int tryagain = 0;
+ struct buffer_head * tmp = bh;
do {
- if (!buffer_dirty(bh) && !buffer_locked(bh))
- continue;
-
- /* Don't start IO first time around.. */
- if (!test_and_set_bit(BH_Wait_IO, &bh->b_state))
- continue;
-
- /* Second time through we start actively writing out.. */
- if (test_and_set_bit(BH_Lock, &bh->b_state)) {
- if (!test_bit(BH_launder, &bh->b_state))
- continue;
- wait_on_buffer(bh);
- tryagain = 1;
- continue;
- }
-
- if (!atomic_set_buffer_clean(bh)) {
- unlock_buffer(bh);
- continue;
- }
-
- __mark_buffer_clean(bh);
- get_bh(bh);
- set_bit(BH_launder, &bh->b_state);
- bh->b_end_io = end_buffer_io_sync;
- submit_bh(WRITE, bh);
- tryagain = 0;
- } while ((bh = bh->b_this_page) != head);
-
- return tryagain;
+ struct buffer_head *p = tmp;
+ tmp = tmp->b_this_page;
+ if (buffer_locked(p)) {
+ if (gfp_mask & __GFP_WAIT)
+ __wait_on_buffer(p);
+ } else if (buffer_dirty(p))
+ ll_rw_block(WRITE, 1, &p);
+ } while (tmp != bh);
}
/*
* Can the buffer be thrown out?
*/
-#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock))
+#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
#define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
/*
@@ -2390,10 +2456,13 @@
int try_to_free_buffers(struct page * page, unsigned int gfp_mask)
{
struct buffer_head * tmp, * bh = page->buffers;
+ int index = BUFSIZE_INDEX(bh->b_size);
+ int loop = 0;
cleaned_buffers_try_again:
spin_lock(&lru_list_lock);
write_lock(&hash_table_lock);
+ spin_lock(&free_list[index].lock);
tmp = bh;
do {
if (buffer_busy(tmp))
@@ -2403,18 +2472,18 @@
spin_lock(&unused_list_lock);
tmp = bh;
-
- /* if this buffer was hashed, this page counts as buffermem */
- if (bh->b_pprev)
- atomic_dec(&buffermem_pages);
do {
struct buffer_head * p = tmp;
tmp = tmp->b_this_page;
- if (p->b_dev == B_FREE) BUG();
-
- remove_inode_queue(p);
- __remove_from_queues(p);
+ /* The buffer can be either on the regular
+ * queues or on the free list..
+ */
+ if (p->b_dev != B_FREE) {
+ remove_inode_queue(p);
+ __remove_from_queues(p);
+ } else
+ __remove_from_free_list(p, index);
__put_unused_buffer_head(p);
} while (tmp != bh);
spin_unlock(&unused_list_lock);
@@ -2425,27 +2494,29 @@
/* And free the page */
page->buffers = NULL;
page_cache_release(page);
+ spin_unlock(&free_list[index].lock);
write_unlock(&hash_table_lock);
spin_unlock(&lru_list_lock);
return 1;
busy_buffer_page:
/* Uhhuh, start writeback so that we don't end up with all dirty pages */
+ spin_unlock(&free_list[index].lock);
write_unlock(&hash_table_lock);
spin_unlock(&lru_list_lock);
if (gfp_mask & __GFP_IO) {
- if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
- if (sync_page_buffers(bh, gfp_mask)) {
- /* no IO or waiting next time */
- gfp_mask = 0;
- goto cleaned_buffers_try_again;
- }
+ sync_page_buffers(bh, gfp_mask);
+ /* We waited synchronously, so we can free the buffers. */
+ if ((gfp_mask & __GFP_WAIT) && !loop) {
+ loop = 1;
+ goto cleaned_buffers_try_again;
}
+ wakeup_bdflush(0);
}
- if (balance_dirty_state() >= 0)
- wakeup_bdflush();
return 0;
}
+EXPORT_SYMBOL(try_to_free_buffers);
+EXPORT_SYMBOL(buffermem_pages);
/* ================== Debugging =================== */
@@ -2454,8 +2525,9 @@
#ifdef CONFIG_SMP
struct buffer_head * bh;
int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
+ int protected = 0;
int nlist;
- static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", };
+ static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
#endif
printk("Buffer memory: %6dkB\n",
@@ -2465,7 +2537,7 @@
if (!spin_trylock(&lru_list_lock))
return;
for(nlist = 0; nlist < NR_LIST; nlist++) {
- found = locked = dirty = used = lastused = 0;
+ found = locked = dirty = used = lastused = protected = 0;
bh = lru_list[nlist];
if(!bh) continue;
@@ -2473,6 +2545,8 @@
found++;
if (buffer_locked(bh))
locked++;
+ if (buffer_protected(bh))
+ protected++;
if (buffer_dirty(bh))
dirty++;
if (atomic_read(&bh->b_count))
@@ -2486,9 +2560,9 @@
buf_types[nlist], found, tmp);
}
printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
- "%d locked, %d dirty\n",
+ "%d locked, %d protected, %d dirty\n",
buf_types[nlist], found, size_buffers_type[nlist]>>10,
- used, lastused, locked, dirty);
+ used, lastused, locked, protected, dirty);
}
spin_unlock(&lru_list_lock);
#endif
@@ -2543,6 +2617,12 @@
for(i = 0; i < nr_hash; i++)
hash_table[i] = NULL;
+ /* Setup free lists. */
+ for(i = 0; i < NR_SIZES; i++) {
+ free_list[i].list = NULL;
+ free_list[i].lock = SPIN_LOCK_UNLOCKED;
+ }
+
/* Setup lru lists. */
for(i = 0; i < NR_LIST; i++)
lru_list[i] = NULL;
@@ -2557,11 +2637,69 @@
* a limited number of buffers to the disks and then go back to sleep again.
*/
+/* This is the _only_ function that deals with flushing async writes
+ to disk.
+ NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
+ as all dirty buffers live _only_ in the DIRTY lru list.
+ As we never browse the LOCKED and CLEAN lru lists they are in fact
+ completely useless. */
+static int flush_dirty_buffers(int check_flushtime)
+{
+ struct buffer_head * bh, *next;
+ int flushed = 0, i;
+
+ restart:
+ spin_lock(&lru_list_lock);
+ bh = lru_list[BUF_DIRTY];
+ if (!bh)
+ goto out_unlock;
+ for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
+ next = bh->b_next_free;
+
+ if (!buffer_dirty(bh)) {
+ __refile_buffer(bh);
+ continue;
+ }
+ if (buffer_locked(bh))
+ continue;
+
+ if (check_flushtime) {
+ /* The dirty lru list is chronologically ordered so
+ if the current bh is not yet timed out,
+ then also all the following bhs
+ will be too young. */
+ if (time_before(jiffies, bh->b_flushtime))
+ goto out_unlock;
+ } else {
+ if (++flushed > bdf_prm.b_un.ndirty)
+ goto out_unlock;
+ }
+
+ /* OK, now we are committed to write it out. */
+ get_bh(bh);
+ spin_unlock(&lru_list_lock);
+ ll_rw_block(WRITE, 1, &bh);
+ put_bh(bh);
+
+ if (current->need_resched)
+ schedule();
+ goto restart;
+ }
+ out_unlock:
+ spin_unlock(&lru_list_lock);
+
+ return flushed;
+}
+
DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
-void wakeup_bdflush(void)
+void wakeup_bdflush(int block)
{
- wake_up_interruptible(&bdflush_wait);
+ if (waitqueue_active(&bdflush_wait))
+ wake_up_interruptible(&bdflush_wait);
+
+ if (block)
+ flush_dirty_buffers(0);
}
/*
@@ -2579,18 +2717,9 @@
sync_supers(0);
unlock_kernel();
- for (;;) {
- struct buffer_head *bh;
-
- spin_lock(&lru_list_lock);
- bh = lru_list[BUF_DIRTY];
- if (!bh || time_before(jiffies, bh->b_flushtime))
- break;
- if (write_some_buffers(NODEV))
- continue;
- return 0;
- }
- spin_unlock(&lru_list_lock);
+ flush_dirty_buffers(1);
+ /* must really sync all the active I/O requests to disk here */
+ run_task_queue(&tq_disk);
return 0;
}
@@ -2661,7 +2790,7 @@
int bdflush(void *startup)
{
struct task_struct *tsk = current;
-
+ int flushed;
/*
* We have a bare-bones task_struct, and really should fill
* in a few more things so "top" and /proc/2/{exe,root,cwd}
@@ -2684,9 +2813,15 @@
for (;;) {
CHECK_EMERGENCY_SYNC
- spin_lock(&lru_list_lock);
- if (!write_some_buffers(NODEV) || balance_dirty_state() < 0) {
- wait_for_some_buffers(NODEV);
+ flushed = flush_dirty_buffers(0);
+
+ /*
+ * If there are still a lot of dirty buffers around,
+ * skip the sleep and flush some more. Otherwise, we
+ * go to sleep waiting for a wakeup.
+ */
+ if (!flushed || balance_dirty_state(NODEV) < 0) {
+ run_task_queue(&tq_disk);
interruptible_sleep_on(&bdflush_wait);
}
}
@@ -2717,8 +2852,6 @@
complete((struct completion *)startup);
for (;;) {
- wait_for_some_buffers(NODEV);
-
/* update interval */
interval = bdf_prm.b_un.interval;
if (interval) {
|